Update Read It Later

2025-07-09 03:04:10 -04:00 · 2013-05-26 20:40:29 +05:30 · 2013-05-26 20:40:29 +05:30 · 55a11e5037
commit 55a11e5037
parent 072ce4c5b7
1 changed files with 90 additions and 60 deletions
--- a/recipes/readitlater.recipe
+++ b/recipes/readitlater.recipe
@ -1,61 +1,65 @@
 """
-Pocket Calibre Recipe v1.3
+Pocket Calibre Recipe v1.4
 """
 from calibre import strftime
 from calibre.web.feeds.news import BasicNewsRecipe
-import urllib2
-import urllib
+from string import Template
 import json
 import operator
-import tempfile
 import re
+import tempfile
+import urllib
+import urllib2

-__license__   = 'GPL v3'
+
+__license__ = 'GPL v3'
 __copyright__ = '''
 2010, Darko Miletic <darko.miletic at gmail.com>
 2011, Przemyslaw Kryger <pkryger at gmail.com>
-2012, tBunnyMan <Wag That Tail At Me dot com>
+2012-2013, tBunnyMan <Wag That Tail At Me dot com>
 '''


 class Pocket(BasicNewsRecipe):
-    title                 = 'Pocket'
-    __author__            = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
-    description           = '''Personalized news feeds. Go to getpocket.com to setup up \
-                            your news. This version displays pages of articles from \
-                            oldest to newest, with max & minimum counts, and marks articles \
-                            read after downloading.'''
-    publisher             = 'getpocket.com'
-    category              = 'news, custom'
+    title = 'Pocket'
+    __author__ = 'Darko Miletic, Przemyslaw Kryger, Keith Callenberg, tBunnyMan'
+    description = '''Personalized news feeds. Go to getpocket.com to setup up
+                  your news. This version displays pages of articles from
+                  oldest to newest, with max & minimum counts, and marks
+                  articles read after downloading.'''
+    publisher = 'getpocket.com'
+    category = 'news, custom'
+
+    #Settings people change
    max_articles_per_feed = 50
    minimum_articles      = 10
-    #Set this to False for testing
-    mark_as_read_after_dl = False
-    #MUST be either 'oldest' or 'newest'
-    sort_method           = 'oldest'
-    #To filter by tag this needs to be a single tag in quotes; IE 'calibre'
+    mark_as_read_after_dl = True  # Set this to False for testing
+    sort_method           = 'oldest'  # MUST be either 'oldest' or 'newest'
+    # To filter by tag this needs to be a single tag in quotes; IE 'calibre'
    only_pull_tag         = None

-    #You don't want to change anything under here unless you REALLY know what you are doing
-    no_stylesheets        = True
-    use_embedded_content  = False
-    needs_subscription    = True
+    # Don't change anything below unless you REALLY know what you are doing
+    no_stylesheets = True
+    use_embedded_content = False
+    needs_subscription = True
    articles_are_obfuscated = True
-    apikey                = '19eg0e47pbT32z4793Tf021k99Afl889'
-    index_url             = u'http://getpocket.com'
-    ajax_url              = u'http://getpocket.com/a/x/getArticle.php'
-    read_api_url          = index_url + u'/v3/get'
-    modify_api_url        = index_url + u'/v3/send'
-    legacy_login_url      = index_url + u'/l'  # We use this to cheat oAuth
-    articles              = []
+    apikey = '19eg0e47pbT32z4793Tf021k99Afl889'
+    index_url = u'http://getpocket.com'
+    read_api_url = index_url + u'/v3/get'
+    modify_api_url = index_url + u'/v3/send'
+    legacy_login_url = index_url + u'/l'  # We use this to cheat oAuth
+    articles = []

    def get_browser(self, *args, **kwargs):
        """
-        We need to pretend to be a recent version of safari for the mac to prevent User-Agent checks
-        Pocket api requires username and password so fail loudly if it's missing from the config.
+        We need to pretend to be a recent version of safari for the mac to
+        prevent User-Agent checks. Pocket api requires username and password
+        so fail loudly if it's missing from the config.
        """
        br = BasicNewsRecipe.get_browser(self,
-            user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4')
+                user_agent='Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; \
+                        en-us) AppleWebKit/533.19.4 (KHTML, like Gecko) \
+                        Version/5.0.3 Safari/533.19.4')
        if self.username is not None and self.password is not None:
            br.open(self.legacy_login_url)
            br.select_form(nr=0)
@ -63,7 +67,7 @@ class Pocket(BasicNewsRecipe):
            br['password'] = self.password
            br.submit()
        else:
-            self.user_error("This Recipe requires authentication, please configured user & pass")
+            self.user_error("This Recipe requires authentication")
        return br

    def get_auth_uri(self):
@ -71,21 +75,20 @@ class Pocket(BasicNewsRecipe):
        uri = ""
        uri = u'{0}&apikey={1!s}'.format(uri, self.apikey)
        if self.username is None or self.password is None:
-            self.user_error("Username or password is blank. Pocket no longer supports blank passwords")
+            self.user_error("Username or password is blank.")
        else:
            uri = u'{0}&username={1!s}'.format(uri, self.username)
            uri = u'{0}&password={1!s}'.format(uri, self.password)
        return uri

    def get_pull_articles_uri(self):
-        """Return the part of the uri that has all of the get request settings"""
        uri = ""
-        uri = u'{0}&state={1}'.format(uri, u'unread')  # TODO This could be modded to allow pulling archives
-        uri = u'{0}&contentType={1}'.format(uri, u'article')  # TODO This COULD return images too
+        uri = u'{0}&state={1}'.format(uri, u'unread')
+        uri = u'{0}&contentType={1}'.format(uri, u'article')
        uri = u'{0}&sort={1}'.format(uri, self.sort_method)
        uri = u'{0}&count={1!s}'.format(uri, self.max_articles_per_feed)
        if self.only_pull_tag is not None:
-            uri = u'{0}tag={1}'.format(uri, self.only_pull_tag)
+            uri = u'{0}&tag={1}'.format(uri, self.only_pull_tag)
        return uri

    def parse_index(self):
@ -100,11 +103,12 @@ class Pocket(BasicNewsRecipe):
            response = urllib2.urlopen(request)
            pocket_feed = json.load(response)['list']
        except urllib2.HTTPError as e:
-            self.log.exception("Pocket returned an error: {0}\nurl: {1}".format(e, fetch_url))
+            self.log.exception("Pocket returned an error: {0}".format(e.info()))
            return []
        except urllib2.URLError as e:
            self.log.exception("Unable to connect to getpocket.com's api: {0}\nurl: {1}".format(e, fetch_url))
            return []
+
        if len(pocket_feed) < self.minimum_articles:
            self.mark_as_read_after_dl = False
            self.user_error("Only {0} articles retrieved, minimum_articles not reached".format(len(pocket_feed)))
@ -120,39 +124,65 @@ class Pocket(BasicNewsRecipe):
                'sort':         pocket_article[1]['sort_id']
            })
        self.articles = sorted(self.articles, key=operator.itemgetter('sort'))
-        print self.articles
        return [("My Pocket Articles for {0}".format(strftime('[%I:%M %p]')), self.articles)]

-    def get_obfuscated_article(self, url):
+    def get_textview(self, url):
+        """
+        Since Pocket's v3 API they removed access to textview. They also
+        redesigned their page to make it much harder to scrape their textview.
+        We need to pull the article, retrieve the formcheck id, then use it
+        to query for the json version.
+        This function will break when pocket hates us.
+        """
+        ajax_url = self.index_url + u'/a/x/getArticle.php'
        soup = self.index_to_soup(url)
-        formcheck_script_tag = soup.find('script', text=re.compile("formCheck"))
-        form_check = formcheck_script_tag.split("=")[1].replace("'", "").replace(";", "").strip()
+        fc_tag = soup.find('script', text=re.compile("formCheck"))
+        fc_id = re.search(r"\'([\d\w]+)\'", fc_tag).group(1)
        article_id = url.split("/")[-1]
-        data = urllib.urlencode({'itemId': article_id, 'formCheck': form_check})
-        response = self.browser.open(self.ajax_url, data)
-        article_json = json.load(response)['article']['article']
+        data = urllib.urlencode({'itemId': article_id, 'formCheck': fc_id})
+        try:
+            response = self.browser.open(ajax_url, data)
+        except urllib2.HTTPError as e:
+            self.log.exception("unable to get textview {0}".format(e.info()))
+            raise e
+        return json.load(response)['article']
+
+    def get_obfuscated_article(self, url):
+        """
+        Our get_textview returns parsed json so prettify it to something well
+        parsed by calibre.
+        """
+        article = self.get_textview(url)
+        template = Template('<h1>$title</h1>\
+                $img\
+                <div class="body">$body</div>')
+        try:
+            image = '<img src="{0}" \>'.format(article['images']['1']['src'])
+        except:
+            image = ''
        with tempfile.NamedTemporaryFile(delete=False) as tf:
-            tf.write(article_json)
+            tf.write(template.safe_substitute(
+                title=article['title'],
+                img=image,
+                body=article['article']
+                ))
        return tf.name

    def mark_as_read(self, mark_list):
-        formatted_list = []
+        actions_list = []
        for article_id in mark_list:
-            formatted_list.append({
+            actions_list.append({
                'action': 'archive',
                'item_id': article_id
            })
-        command = {
-            'actions': formatted_list
-        }
-        mark_read_url = u'{0}?{1}'.format(
+        mark_read_url = u'{0}?actions={1}{2}'.format(
            self.modify_api_url,
+            json.dumps(actions_list, separators=(',', ':')),
            self.get_auth_uri()
        )
        try:
-            request = urllib2.Request(mark_read_url, json.dumps(command))
-            response = urllib2.urlopen(request)
-            print u'response = {0}'.format(response.info())
+            request = urllib2.Request(mark_read_url)
+            urllib2.urlopen(request)
        except urllib2.HTTPError as e:
            self.log.exception('Pocket returned an error while archiving articles: {0}'.format(e))
            return []
@ -162,7 +192,7 @@ class Pocket(BasicNewsRecipe):

    def cleanup(self):
        if self.mark_as_read_after_dl:
-            self.mark_as_read([x[1]['item_id'] for x in self.articles])
+            self.mark_as_read([x['item_id'] for x in self.articles])
        else:
            pass

@ -174,7 +204,7 @@ class Pocket(BasicNewsRecipe):
        try:
            from calibre.ebooks import calibre_cover
            title = self.title if isinstance(self.title, unicode) else \
-                    self.title.decode('utf-8', 'replace')
+                self.title.decode('utf-8', 'replace')
            date = strftime(self.timefmt)
            time = strftime('[%I:%M %p]')
            img_data = calibre_cover(title, date, time)
@ -192,4 +222,4 @@ class Pocket(BasicNewsRecipe):
            self.log.exception(error_message)
            raise RuntimeError(error_message)

-# vim:ft=python
+# vim:ft=python tabstop=8 expandtab shiftwidth=4 softtabstop=4