mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-07 10:14:46 -04:00
Fix Pocket/readitlater recipe
This recipe wasn't working (at least for me): it would consistently fail with the following error when trying to download an article:

```
Traceback (most recent call last):
  File "calibre/utils/threadpool.py", line 100, in run
  File "calibre/web/feeds/news.py", line 1186, in fetch_obfuscated_article
  File "<string>", line 157, in get_obfuscated_article
  File "<string>", line 142, in get_textview
  File "re.py", line 201, in search
TypeError: expected string or bytes-like object
```

I believe this is because Pocket doesn't allow access to their "Article View" API by default: https://getpocket.com/developer/docs/v3/article-view

This change uses the original URL of the article, rather than the Pocket URL for it (those `getpocket.com/a/read/<id>` URLs in the browser seem to just redirect me to `getpocket.com/my-list`).

I have a feeling that the old way might have produced cleaner articles (Pocket cleanup, then Calibre cleanup), but I've never seen it work successfully.

We could alternatively try to convince Pocket to enable that API for the "app" we use here (I think this usage would qualify, as it's a "Pocket specific feature" (for Calibre)). That might require adopting their full OAuth flow, rather than using username/password. From reading their API docs with Calibre in mind, I think that would mean we'd have to host a web page somewhere that the user would open in their normal web browser. There they would click a button that redirects them to the Pocket page to authorize the Calibre app to access their Pocket account. Pocket would then redirect them back to our web page, where we'd instruct them to enter the access token into the "password" field for the recipe in Calibre.
This commit is contained in:
parent
2a45519e5d
commit
6590bcd76e
@ -1,5 +1,5 @@
|
||||
"""
|
||||
Pocket Calibre Recipe v1.4
|
||||
Pocket Calibre Recipe v1.5
|
||||
"""
|
||||
from calibre import strftime
|
||||
from calibre.web.feeds.news import BasicNewsRecipe
|
||||
@ -39,7 +39,7 @@ class Pocket(BasicNewsRecipe):
|
||||
# Settings people change
|
||||
oldest_article = 7.0
|
||||
max_articles_per_feed = 50
|
||||
minimum_articles = 10
|
||||
minimum_articles = 1
|
||||
mark_as_read_after_dl = True # Set this to False for testing
|
||||
sort_method = 'oldest' # MUST be either 'oldest' or 'newest'
|
||||
# To filter by tag this needs to be a single tag in quotes; IE 'calibre'
|
||||
@ -49,7 +49,7 @@ class Pocket(BasicNewsRecipe):
|
||||
no_stylesheets = True
|
||||
use_embedded_content = False
|
||||
needs_subscription = True
|
||||
articles_are_obfuscated = True
|
||||
articles_are_obfuscated = False
|
||||
apikey = '19eg0e47pbT32z4793Tf021k99Afl889'
|
||||
index_url = u'https://getpocket.com'
|
||||
read_api_url = index_url + u'/v3/get'
|
||||
@ -118,7 +118,7 @@ class Pocket(BasicNewsRecipe):
|
||||
'item_id': pocket_article[0],
|
||||
'title': pocket_article[1]['resolved_title'],
|
||||
'date': pocket_article[1]['time_updated'],
|
||||
'url': u'{0}/a/read/{1}'.format(self.index_url, pocket_article[0]),
|
||||
'url': pocket_article[1]['resolved_url'],
|
||||
'real_url': pocket_article[1]['resolved_url'],
|
||||
'description': pocket_article[1]['excerpt'],
|
||||
'sort': pocket_article[1]['sort_id']
|
||||
@ -126,49 +126,6 @@ class Pocket(BasicNewsRecipe):
|
||||
self.articles = sorted(self.articles, key=operator.itemgetter('sort'))
|
||||
return [("My Pocket Articles for {0}".format(strftime('[%I:%M %p]')), self.articles)]
|
||||
|
||||
def get_textview(self, url):
|
||||
"""
|
||||
Since Pocket's v3 API they removed access to textview. They also
|
||||
redesigned their page to make it much harder to scrape their textview.
|
||||
We need to pull the article, retrieve the formcheck id, then use it
|
||||
to query for the JSON version
|
||||
This function will break when pocket hates us
|
||||
"""
|
||||
ajax_url = self.index_url + u'/a/x/getArticle.php'
|
||||
soup = self.index_to_soup(url)
|
||||
fc_tag = soup.find('script', text=re.compile("formCheck"))
|
||||
fc_id = re.search(r"formCheck = \'([\d\w]+)\';", fc_tag).group(1)
|
||||
article_id = url.split("/")[-1]
|
||||
data = urlencode({'itemId': article_id, 'formCheck': fc_id})
|
||||
try:
|
||||
response = self.browser.open(ajax_url, data)
|
||||
except HTTPError as e:
|
||||
self.log.exception("unable to get textview {0}".format(e.info()))
|
||||
raise e
|
||||
return json.load(response)['article']
|
||||
|
||||
def get_obfuscated_article(self, url):
|
||||
"""
|
||||
Our get_textview returns parsed json so prettify it to something well
|
||||
parsed by calibre.
|
||||
"""
|
||||
article = self.get_textview(url)
|
||||
template = Template('<h1>$title</h1><div class="body">$body</div>')
|
||||
with tempfile.NamedTemporaryFile(delete=False) as tf:
|
||||
tmpbody = article['article']
|
||||
for img in article['images']:
|
||||
imgdiv = '<div id="RIL_IMG_{0}" class="RIL_IMG"></div>'.format(
|
||||
article['images'][img]['image_id'])
|
||||
imgtag = r'<img src="{0}" \>'.format(
|
||||
article['images'][img]['src'])
|
||||
tmpbody = tmpbody.replace(imgdiv, imgtag)
|
||||
|
||||
tf.write(template.safe_substitute(
|
||||
title=article['title'],
|
||||
body=tmpbody
|
||||
))
|
||||
return tf.name
|
||||
|
||||
def mark_as_read(self, mark_list):
|
||||
actions_list = []
|
||||
for article_id in mark_list:
|
||||
|
Loading…
x
Reference in New Issue
Block a user