Arrêt sur images by François D.

This commit is contained in:
Kovid Goyal 2013-03-31 07:21:58 +05:30
parent 9be9b8fb36
commit a671413989
3 changed files with 62 additions and 4 deletions

View File

@@ -0,0 +1,54 @@
from __future__ import unicode_literals

__license__ = 'WTFPL'
__author__ = '2013, François D. <franek at chicour.net>'
__description__ = 'Get some fresh news from Arrêt sur images'

from calibre.web.feeds.recipes import BasicNewsRecipe


class Asi(BasicNewsRecipe):

    title = 'Arrêt sur images'
    __author__ = 'François D. (aka franek)'
    description = 'Global news in french from news site "Arrêt sur images"'

    oldest_article = 7.0
    language = 'fr'
    needs_subscription = True
    max_articles_per_feed = 100
    simultaneous_downloads = 1
    timefmt = '[%a, %d %b %Y %I:%M +0200]'
    cover_url = 'http://www.arretsurimages.net/images/header/menu/menu_1.png'
    use_embedded_content = False
    no_stylesheets = True
    remove_javascript = True

    feeds = [
        ('vite dit et gratuit', 'http://www.arretsurimages.net/vite-dit.rss'),
        ('Toutes les chroniques', 'http://www.arretsurimages.net/chroniques.rss'),
        ('Contenus et dossiers', 'http://www.arretsurimages.net/dossiers.rss'),
    ]

    conversion_options = {'smarten_punctuation': True}

    remove_tags = [
        dict(id='vite-titre'), dict(id='header'), dict(id='wrap-connexion'),
        dict(id='col_right'), dict(name='div', attrs={'class': 'bloc-chroniqueur-2'}),
        dict(id='footercontainer'),
    ]

    def print_version(self, url):
        return url.replace('contenu.php', 'contenu-imprimable.php')

    def get_browser(self):
        # Need to use robust HTML parser
        br = BasicNewsRecipe.get_browser(self, use_robust_parser=True)
        if self.username is not None and self.password is not None:
            br.open('http://www.arretsurimages.net/index.php')
            br.select_form(nr=0)
            br.form.set_all_readonly(False)
            br['redir'] = 'forum/login.php'
            br['username'] = self.username
            br['password'] = self.password
            br.submit()
        return br
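Note on the use_robust_parser keyword used by get_browser() above: this commit only touches the browser() helper and the Browser wrapper shown in the next two files, so it assumes that BasicNewsRecipe.get_browser forwards extra keyword arguments to calibre's browser() function. A minimal sketch of that assumed call chain (the class and method below are illustrative, not calibre's actual source):

# Sketch only: assumed forwarding of keyword arguments from a recipe's
# get_browser() call down to the patched calibre.browser() helper.
from calibre import browser

class RecipeBrowserSketch(object):
    def get_browser(self, *args, **kwargs):
        # use_robust_parser=True is assumed to pass through unchanged, so the
        # mechanize browser is built with RobustFactory (see the next file).
        return browser(*args, **kwargs)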

View File

@@ -376,7 +376,7 @@ def random_user_agent(choose=None):
         choose = random.randint(0, len(choices)-1)
     return choices[choose]
 
-def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
+def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None, use_robust_parser=False):
     '''
     Create a mechanize browser for web scraping. The browser handles cookies,
     refresh requests and ignores robots.txt. Also uses proxy if available.
@@ -385,7 +385,11 @@ def browser(honor_time=True, max_time=2, mobile_browser=False, user_agent=None):
     :param max_time: Maximum time in seconds to wait during a refresh request
     '''
     from calibre.utils.browser import Browser
-    opener = Browser()
+    if use_robust_parser:
+        import mechanize
+        opener = Browser(factory=mechanize.RobustFactory())
+    else:
+        opener = Browser()
     opener.set_handle_refresh(True, max_time=max_time, honor_time=honor_time)
     opener.set_handle_robots(False)
     if user_agent is None:
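With the new keyword in place, a caller can request mechanize's tolerant parser directly. A small usage sketch, assuming only the change above (the URL is simply the one from the recipe, presumably the markup that trips up the default parser):

# Usage sketch for the new use_robust_parser flag.
from calibre import browser

br = browser(use_robust_parser=True)  # mechanize browser built with RobustFactory
br.open('http://www.arretsurimages.net/index.php')
br.select_form(nr=0)  # robust parsing helps mechanize find forms in messy HTML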

View File

@@ -17,10 +17,10 @@ class Browser(B):
     cookie jar. All clones share the same browser configuration.
     '''
 
-    def __init__(self):
+    def __init__(self, *args, **kwargs):
         self._clone_actions = {}
-        B.__init__(self)
+        B.__init__(self, *args, **kwargs)
         self.set_cookiejar(CookieJar())
 
     def set_handle_refresh(self, *args, **kwargs):
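The widened __init__ signature is what lets the patched browser() construct the wrapper as Browser(factory=mechanize.RobustFactory()). A rough standalone illustration of the same pattern, assuming B is mechanize.Browser as in calibre.utils.browser:

# Standalone sketch of why *args/**kwargs must be forwarded to mechanize.
import mechanize

class Browser(mechanize.Browser):

    def __init__(self, *args, **kwargs):
        self._clone_actions = {}
        # With the old zero-argument signature, passing factory=... raised a
        # TypeError; forwarding everything lets mechanize pick the HTML parser.
        mechanize.Browser.__init__(self, *args, **kwargs)

br = Browser(factory=mechanize.RobustFactory())  # tag-soup tolerant parsing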