Mirror of https://github.com/kovidgoyal/calibre.git (synced 2025-07-09 03:04:10 -04:00)
Merge branch 'fix-scmp' of https://github.com/ping/calibre
This commit is contained in:
commit 7ceb4caa8c
@ -106,10 +106,11 @@ class SCMP(BasicNewsRecipe):
|
|||||||
caption_text = child.get("attribs", {}).get("alt") or child.get(
|
caption_text = child.get("attribs", {}).get("alt") or child.get(
|
||||||
"attribs", {}
|
"attribs", {}
|
||||||
).get("title")
|
).get("title")
|
||||||
caption_tag = soup.new_tag("span")
|
if caption_text:
|
||||||
caption_tag.string = caption_text
|
new_ele = soup.new_tag("span")
|
||||||
caption_tag["class"] = "caption"
|
new_ele.append(caption_text)
|
||||||
child_html += str(caption_tag)
|
new_ele["class"] = "caption"
|
||||||
|
child_html += str(new_ele)
|
||||||
ele["class"] = "article-img"
|
ele["class"] = "article-img"
|
||||||
ele.append(BeautifulSoup(child_html))
|
ele.append(BeautifulSoup(child_html))
|
||||||
|
|
||||||
@ -118,15 +119,20 @@ class SCMP(BasicNewsRecipe):
|
|||||||
soup = BeautifulSoup(raw_html)
|
soup = BeautifulSoup(raw_html)
|
||||||
|
|
||||||
for script in soup.find_all("script"):
|
for script in soup.find_all("script"):
|
||||||
if not script.text.startswith("window.__APOLLO_STATE__"):
|
if not script.contents:
|
||||||
|
continue
|
||||||
|
if not script.contents[0].startswith("window.__APOLLO_STATE__"):
|
||||||
continue
|
continue
|
||||||
article_js = re.sub(
|
article_js = re.sub(
|
||||||
r"window.__APOLLO_STATE__\s*=\s*", "", script.text.strip()
|
r"window.__APOLLO_STATE__\s*=\s*", "", script.contents[0].strip()
|
||||||
)
|
)
|
||||||
if article_js.endswith(";"):
|
if article_js.endswith(";"):
|
||||||
article_js = article_js[:-1]
|
article_js = article_js[:-1]
|
||||||
article = json.loads(article_js)
|
try:
|
||||||
break
|
article = json.loads(article_js)
|
||||||
|
break
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
self.log.exception("Unable to parse __APOLLO_STATE__")
|
||||||
|
|
||||||
if not (article and article.get("contentService")):
|
if not (article and article.get("contentService")):
|
||||||
# Sometimes the page does not have article content in the <script>
|
# Sometimes the page does not have article content in the <script>
|
||||||
|
Loading…
x
Reference in New Issue
Block a user