Fix Pocket/readitlater recipe

This recipe wasn't working (at least for me): it would consistently
fail with the following error when trying to download an
article:

```
Traceback (most recent call last):
  File "calibre/utils/threadpool.py", line 100, in run
  File "calibre/web/feeds/news.py", line 1186, in fetch_obfuscated_article
  File "<string>", line 157, in get_obfuscated_article
  File "<string>", line 142, in get_textview
  File "re.py", line 201, in search
TypeError: expected string or bytes-like object
```

I believe this is because Pocket doesn't allow access to its
"Article View" API by default:
https://getpocket.com/developer/docs/v3/article-view

This change uses the original URL of the article, rather than the
Pocket URL for it. (Those `getpocket.com/a/read/<id>` URLs seem to
just redirect me to `getpocket.com/my-list` in the browser.) I've a
feeling that the old way might have produced cleaner articles (Pocket
cleanup & then Calibre cleanup), but I've never seen it work
successfully.

We could alternatively try to convince Pocket to enable that API for
the "app" we use here; I think this usage would qualify, as it's a
"Pocket specific feature" (for Calibre). That might require adopting
their full OAuth flow, rather than using username/password. From
reading their API docs with Calibre in mind, I think that would mean
hosting a web page somewhere that the user would open in their normal
web browser. There they would click a button that redirects them to
the Pocket page to authorize the Calibre app to access their Pocket
account. Pocket would then redirect them back to our web page, where
we'd instruct them to enter the access token into the "password" field
for the recipe in Calibre.
This commit is contained in:
Gerard Ryan 2021-12-11 22:52:20 +00:00
parent 2a45519e5d
commit 6590bcd76e
No known key found for this signature in database
GPG Key ID: 4E00938BDBFCA05B

View File

@ -1,5 +1,5 @@
"""
Pocket Calibre Recipe v1.4
Pocket Calibre Recipe v1.5
"""
from calibre import strftime
from calibre.web.feeds.news import BasicNewsRecipe
@ -39,7 +39,7 @@ class Pocket(BasicNewsRecipe):
# Settings people change
oldest_article = 7.0
max_articles_per_feed = 50
minimum_articles = 10
minimum_articles = 1
mark_as_read_after_dl = True # Set this to False for testing
sort_method = 'oldest' # MUST be either 'oldest' or 'newest'
# To filter by tag this needs to be a single tag in quotes; IE 'calibre'
@ -49,7 +49,7 @@ class Pocket(BasicNewsRecipe):
no_stylesheets = True
use_embedded_content = False
needs_subscription = True
articles_are_obfuscated = True
articles_are_obfuscated = False
apikey = '19eg0e47pbT32z4793Tf021k99Afl889'
index_url = u'https://getpocket.com'
read_api_url = index_url + u'/v3/get'
@ -118,7 +118,7 @@ class Pocket(BasicNewsRecipe):
'item_id': pocket_article[0],
'title': pocket_article[1]['resolved_title'],
'date': pocket_article[1]['time_updated'],
'url': u'{0}/a/read/{1}'.format(self.index_url, pocket_article[0]),
'url': pocket_article[1]['resolved_url'],
'real_url': pocket_article[1]['resolved_url'],
'description': pocket_article[1]['excerpt'],
'sort': pocket_article[1]['sort_id']
@ -126,49 +126,6 @@ class Pocket(BasicNewsRecipe):
self.articles = sorted(self.articles, key=operator.itemgetter('sort'))
return [("My Pocket Articles for {0}".format(strftime('[%I:%M %p]')), self.articles)]
def get_textview(self, url):
"""
Since Pocket's v3 API they removed access to textview. They also
redesigned their page to make it much harder to scrape their textview.
We need to pull the article, retrieve the formcheck id, then use it
to querty for the json version
This function will break when pocket hates us
"""
ajax_url = self.index_url + u'/a/x/getArticle.php'
soup = self.index_to_soup(url)
fc_tag = soup.find('script', text=re.compile("formCheck"))
fc_id = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag).group(1)
article_id = url.split("/")[-1]
data = urlencode({'itemId': article_id, 'formCheck': fc_id})
try:
response = self.browser.open(ajax_url, data)
except HTTPError as e:
self.log.exception("unable to get textview {0}".format(e.info()))
raise e
return json.load(response)['article']
def get_obfuscated_article(self, url):
"""
Our get_textview returns parsed json so prettify it to something well
parsed by calibre.
"""
article = self.get_textview(url)
template = Template('<h1>$title</h1><div class="body">$body</div>')
with tempfile.NamedTemporaryFile(delete=False) as tf:
tmpbody = article['article']
for img in article['images']:
imgdiv = '<div id="RIL_IMG_{0}" class="RIL_IMG"></div>'.format(
article['images'][img]['image_id'])
imgtag = r'<img src="{0}" \>'.format(
article['images'][img]['src'])
tmpbody = tmpbody.replace(imgdiv, imgtag)
tf.write(template.safe_substitute(
title=article['title'],
body=tmpbody
))
return tf.name
def mark_as_read(self, mark_list):
actions_list = []
for article_id in mark_list: