diff --git a/recipes/arret_sur_images.recipe b/recipes/arret_sur_images.recipe
new file mode 100644
index 0000000000..fac2983231
--- /dev/null
+++ b/recipes/arret_sur_images.recipe
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+__license__ = 'WTFPL'
+__author__ = '2013, François D. '
+__description__ = 'Get some fresh news from Arrêt sur images'
+
+
+from calibre.web.feeds.recipes import BasicNewsRecipe
+
+
+class Asi(BasicNewsRecipe):
+    '''
+    Recipe for the French news site "Arrêt sur images".
+
+    The recipe is flagged as needing a subscription, and get_browser()
+    logs in whenever both a username and a password are set.
+    '''
+
+    title = 'Arrêt sur images'
+    __author__ = 'François D. (aka franek)'
+    description = 'Global news in french from news site "Arrêt sur images"'
+
+    oldest_article = 7.0
+    language = 'fr'
+    needs_subscription = True
+    max_articles_per_feed = 100
+
+    simultaneous_downloads = 1
+    # 24-hour clock: %I without %p cannot tell 09:00 from 21:00. No UTC
+    # offset is printed, since a hard-coded "+0200" is wrong outside CEST.
+    timefmt = '[%a, %d %b %Y %H:%M]'
+    cover_url = 'http://www.arretsurimages.net/images/header/menu/menu_1.png'
+
+    use_embedded_content = False
+    no_stylesheets = True
+    remove_javascript = True
+
+    feeds = [
+        ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
+        ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
+        ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
+    ]
+
+    conversion_options = {'smarten_punctuation': True}
+
+    # Tags removed from the downloaded pages; judging by their ids these
+    # are the header, login, sidebar and footer areas of the site.
+    remove_tags = [
+        dict(id='vite-titre'),
+        dict(id='header'),
+        dict(id='wrap-connexion'),
+        dict(id='col_right'),
+        dict(name='div', attrs={'class': 'bloc-chroniqueur-2'}),
+        dict(id='footercontainer'),
+    ]
+
+    def print_version(self, url):
+        '''Return url with 'contenu.php' swapped for the printable page.'''
+        return url.replace('contenu.php', 'contenu-imprimable.php')
+
+    def get_browser(self):
+        '''
+        Return the browser used for downloads, logged in when possible.
+
+        The login form is submitted only when both a username and a
+        password are set; otherwise the browser is returned as created.
+        '''
+        # Need to use robust HTML parser
+        br = BasicNewsRecipe.get_browser(self, use_robust_parser=True)
+        if self.username is not None and self.password is not None:
+            br.open('http://www.arretsurimages.net/index.php')
+            # Assumes the login form is the first form on the page.
+            br.select_form(nr=0)
+            # Presumably 'redir' is a read-only (hidden) control: unlock it.
+            br.form.set_all_readonly(False)
+            br['redir'] = 'forum/login.php'
+            br['username'] = self.username
+            br['password'] = self.password
+            br.submit()
+        return br
diff --git a/src/calibre/__init__.py
b/src/calibre/__init__.py index 7f79877bd5..bd7d01c0a0 100644 --- a/src/calibre/__init__.py +++ b/src/calibre/__init__.py @@ -376,7 +376,7 @@ def random_user_agent(choose=None): choose = random.randint(0, len(choices)-1) return choices[choose] -def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None): +def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False): ''' Create a mechanize browser for web scraping. The browser handles cookies, refresh requests and ignores robots.txt. Also uses proxy if available. @@ -385,7 +385,11 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None): :param max_time: Maximum time in seconds to wait during a refresh request ''' from calibre.utils.browser import Browser - opener = Browser() + if use_robust_parser: + import mechanize + opener = Browser(factory=mechanize.RobustFactory()) + else: + opener = Browser() opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time) opener.set_handle_robots(False) if user_agent is None: diff --git a/src/calibre/utils/browser.py b/src/calibre/utils/browser.py index fc04044ad3..4ee1478d73 100644 --- a/src/calibre/utils/browser.py +++ b/src/calibre/utils/browser.py @@ -17,10 +17,10 @@ class Browser(B): cookie jar. All clones share the same browser configuration. ''' - def __init__(self): + def __init__(self, *args, **kwargs): self._clone_actions = {} - B.__init__(self) + B.__init__(self, *args, **kwargs) self.set_cookiejar(CookieJar()) def set_handle_refresh(self, *args, **kwargs):