From 6590bcd76e398d757a0e708c32e618cd11a1fbfd Mon Sep 17 00:00:00 2001 From: Gerard Ryan Date: Sat, 11 Dec 2021 22:52:20 +0000 Subject: [PATCH] Fix Pocket/readitlater recipe This recipe wasn't working (at least for me): it would consistently fail with the following error when trying to download an article: ``` Traceback (most recent call last): File "calibre/utils/threadpool.py", line 100, in run File "calibre/web/feeds/news.py", line 1186, in fetch_obfuscated_article File "", line 157, in get_obfuscated_article File "", line 142, in get_textview File "re.py", line 201, in search TypeError: expected string or bytes-like object ``` I believe this is because Pocket doesn't allow access to their "Article View" API by default: https://getpocket.com/developer/docs/v3/article-view This change uses the original URL of the article, rather than the Pocket URL for it (those `getpocket.com/a/read/` URLs in the browser seem to just redirect me to `getpocket.com/my-list`). I've a feeling that the old way might have produced cleaner articles (Pocket cleanup & then Calibre cleanup), but I've never seen it work successfully. We could alternatively try to convince Pocket to enable that API for the "app" we use here (I think this usage would qualify, as it's a "Pocket specific feature" (for Calibre)). That might require adopting their full OAuth flow, rather than using username/password. From reading their API docs with Calibre in mind, I think that would mean we'd have to host a web page somewhere that the user would open in their normal web browser and click a button on; that would redirect them to the Pocket page to authorize the Calibre app to access their Pocket account, which would then redirect them back to our web page, where we'd instruct them to enter the access token into the "password" field for the recipe in Calibre. 
--- recipes/readitlater.recipe | 51 +++----------------------------------- 1 file changed, 4 insertions(+), 47 deletions(-) diff --git a/recipes/readitlater.recipe b/recipes/readitlater.recipe index ae3f608801..82ebc0a9de 100644 --- a/recipes/readitlater.recipe +++ b/recipes/readitlater.recipe @@ -1,5 +1,5 @@ """ -Pocket Calibre Recipe v1.4 +Pocket Calibre Recipe v1.5 """ from calibre import strftime from calibre.web.feeds.news import BasicNewsRecipe @@ -39,7 +39,7 @@ class Pocket(BasicNewsRecipe): # Settings people change oldest_article = 7.0 max_articles_per_feed = 50 - minimum_articles = 10 + minimum_articles = 1 mark_as_read_after_dl = True # Set this to False for testing sort_method = 'oldest' # MUST be either 'oldest' or 'newest' # To filter by tag this needs to be a single tag in quotes; IE 'calibre' @@ -49,7 +49,7 @@ class Pocket(BasicNewsRecipe): no_stylesheets = True use_embedded_content = False needs_subscription = True - articles_are_obfuscated = True + articles_are_obfuscated = False apikey = '19eg0e47pbT32z4793Tf021k99Afl889' index_url = u'https://getpocket.com' read_api_url = index_url + u'/v3/get' @@ -118,7 +118,7 @@ class Pocket(BasicNewsRecipe): 'item_id': pocket_article[0], 'title': pocket_article[1]['resolved_title'], 'date': pocket_article[1]['time_updated'], - 'url': u'{0}/a/read/{1}'.format(self.index_url, pocket_article[0]), + 'url': pocket_article[1]['resolved_url'], 'real_url': pocket_article[1]['resolved_url'], 'description': pocket_article[1]['excerpt'], 'sort': pocket_article[1]['sort_id'] @@ -126,49 +126,6 @@ class Pocket(BasicNewsRecipe): self.articles = sorted(self.articles, key=operator.itemgetter('sort')) return [("My Pocket Articles for {0}".format(strftime('[%I:%M %p]')), self.articles)] - def get_textview(self, url): - """ - Since Pocket's v3 API they removed access to textview. They also - redesigned their page to make it much harder to scrape their textview. 
- We need to pull the article, retrieve the formcheck id, then use it - to querty for the json version - This function will break when pocket hates us - """ - ajax_url = self.index_url + u'/a/x/getArticle.php' - soup = self.index_to_soup(url) - fc_tag = soup.find('script', text=re.compile("formCheck")) - fc_id = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag).group(1) - article_id = url.split("/")[-1] - data = urlencode({'itemId': article_id, 'formCheck': fc_id}) - try: - response = self.browser.open(ajax_url, data) - except HTTPError as e: - self.log.exception("unable to get textview {0}".format(e.info())) - raise e - return json.load(response)['article'] - - def get_obfuscated_article(self, url): - """ - Our get_textview returns parsed json so prettify it to something well - parsed by calibre. - """ - article = self.get_textview(url) - template = Template('

$title

$body
') - with tempfile.NamedTemporaryFile(delete=False) as tf: - tmpbody = article['article'] - for img in article['images']: - imgdiv = '
'.format( - article['images'][img]['image_id']) - imgtag = r''.format( - article['images'][img]['src']) - tmpbody = tmpbody.replace(imgdiv, imgtag) - - tf.write(template.safe_substitute( - title=article['title'], - body=tmpbody - )) - return tf.name - def mark_as_read(self, mark_list): actions_list = [] for article_id in mark_list: