# Mirror of https://github.com/kovidgoyal/calibre.git
# Synced 2025-09-29 15:31:08 -04:00
# 136 lines, 5.1 KiB, Python
import re
from collections import OrderedDict
from urllib.parse import urljoin, urlparse

from calibre.web.feeds.news import BasicNewsRecipe

# When non-empty, parse_index() loads this issue page directly instead of
# looking up the latest issue on the magazine landing page.
_issue_url = ""

# Helpers for parsing an <img srcset="..."> value:
# candidates are split on commas (surrounding whitespace is swallowed) ...
COMMA_SEP_RE = re.compile(r"\s*,\s*")
# ... each candidate is split on whitespace into URL and descriptor ...
SPACE_SEP_RE = re.compile(r"\s+")
# ... and everything but the digits is stripped from a descriptor like "640w".
NON_NUMERIC_RE = re.compile(r"[^\d]+")

class Poetry(BasicNewsRecipe):
    """Calibre news recipe that downloads one issue of Poetry Magazine.

    The latest issue is discovered from the magazine landing page unless the
    module-level ``_issue_url`` is set, in which case that page is used as the
    issue page.
    """

    title = "Poetry Magazine"
    __author__ = "ping"
    description = (
        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
    )
    publication_type = "magazine"
    language = "en"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = False

    remove_attributes = ["style", "font"]
    keep_only_tags = [dict(name="article")]

    remove_tags = [
        dict(name="button"),
        dict(
            attrs={
                "class": [
                    "c-socialBlocks",
                    "c-index",
                    "o-stereo",
                    "u-hideAboveSmall",
                    "c-slideTrigger",
                    "js-slideshow",
                ]
            }
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
    div.c-feature-bd { margin-bottom: 2rem; }
    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
    """

    def extract_from_img_srcset(self, srcset: str, max_width=0):
        """Pick a single image URL out of an ``srcset`` attribute value.

        Args:
            srcset: Raw attribute value: comma-separated candidates, each a URL
                optionally followed by a descriptor such as ``640w``.
            max_width: When non-zero, return the widest candidate whose width
                does not exceed this value. When zero, return the widest one.

        Returns:
            The selected URL. If every candidate is wider than ``max_width``,
            the narrowest candidate is returned. A lone candidate is returned
            as-is, minus any descriptor.

        Raises:
            ValueError: If ``srcset`` holds no candidates, or one of several
                candidates is not a ``"<url> <descriptor>"`` pair whose
                descriptor contains digits.
        """
        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
        if not sources:
            raise ValueError(f"Not a valid srcset: {srcset}")
        if len(sources) == 1:
            # Either a plain URL or "<url> <descriptor>"; only the URL part is
            # usable as an img src, so drop any trailing descriptor.
            return SPACE_SEP_RE.split(sources[0])[0]

        parsed_sources = []
        for src in sources:
            src_n_width = SPACE_SEP_RE.split(src)
            if len(src_n_width) != 2:
                raise ValueError(f"Not a valid srcset: {srcset}")
            url, descriptor = src_n_width
            digits = NON_NUMERIC_RE.sub("", descriptor)
            if not digits:
                raise ValueError(f"Not a valid srcset: {srcset}")
            parsed_sources.append((url, int(digits)))

        # De-duplicate while keeping document order so that, combined with the
        # stable sort below, equal-width candidates resolve deterministically.
        parsed_sources = sorted(
            dict.fromkeys(parsed_sources), key=lambda x: x[1], reverse=True
        )
        if not max_width:
            return parsed_sources[0][0]
        for img, width in parsed_sources:
            if width <= max_width:
                return img
        # Nothing fits: fall back to the narrowest candidate.
        return parsed_sources[-1][0]

    def preprocess_html(self, soup):
        """Point media-enclosure images at a concrete URL taken from ``srcset``.

        For every ``div.o-mediaEnclosure img`` that has a ``srcset``, ``src`` is
        replaced with the candidate chosen by ``extract_from_img_srcset`` with
        ``max_width=1000``. An image whose ``srcset`` cannot be parsed keeps
        its existing ``src``.

        Returns:
            The same ``soup`` object, modified in place.
        """
        for img in soup.select("div.o-mediaEnclosure img"):
            srcset = img.get("srcset")
            if not srcset:
                continue
            try:
                img["src"] = self.extract_from_img_srcset(srcset, max_width=1000)
            except ValueError as err:
                # Best effort: one malformed srcset should not sink the article.
                self.log.warning("Skipping unparseable srcset:", err)
        return soup

    def parse_index(self):
        """Build the list of sections and articles for the issue.

        Side effects: sets ``self.timefmt`` from the issue page's first ``h1``
        and, when a cover image with a ``srcset`` is present, ``self.cover_url``
        (last srcset candidate, query string removed).

        Returns:
            A list of ``(section_title, articles)`` pairs in page order, where
            each article is a dict with ``title``, ``url``, ``author`` and
            ``description`` keys.
        """
        if _issue_url:
            issue_url = _issue_url
        else:
            landing_url = "https://www.poetryfoundation.org/poetrymagazine"
            soup = self.index_to_soup(landing_url)
            current_issue = soup.select("div.c-cover-media a")
            if not current_issue:
                # NOTE(review): relies on abort_recipe_processing raising, as
                # the original code did; the next line is otherwise an
                # IndexError.
                self.abort_recipe_processing("Unable to find latest issue")
            # urljoin leaves absolute links untouched and resolves relative ones.
            issue_url = urljoin(landing_url, current_issue[0]["href"])
        soup = self.index_to_soup(issue_url)

        issue_edition = self.tag_to_string(soup.find("h1"))
        self.timefmt = f" [{issue_edition}]"

        cover_images = soup.select("div.c-issueBillboard-cover-media img")
        cover_srcset = cover_images[0].get("srcset") if cover_images else None
        if cover_srcset:
            # Take the last srcset candidate and strip its descriptor.
            last_candidate = cover_srcset.split(",")[-1].strip().split(" ")[0]
            parsed_cover_url = urlparse(urljoin(issue_url, last_candidate))
            # Rebuild without query/fragment.
            self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"
        else:
            self.log.warning("No cover image found on issue page")

        sectioned_feeds = OrderedDict()

        for tab in soup.find_all("div", attrs={"class": "c-tier_tabbed"}):
            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
            if not (tab_title and tab_content):
                continue
            tab_title = self.tag_to_string(tab_title)
            articles = sectioned_feeds[tab_title] = []
            for li in tab_content.select("ul.o-blocks > li"):
                author = self.tag_to_string(
                    li.find("span", attrs={"class": "c-txt_attribution"})
                )
                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
                    href = link.get("href")
                    if not href:
                        # An anchor without a target cannot be downloaded.
                        continue
                    article_title = self.tag_to_string(link)
                    self.log("Found article:", article_title)
                    articles.append(
                        {
                            "title": article_title,
                            "url": urljoin(issue_url, href),
                            "author": author,
                            "description": author,
                        }
                    )

        return list(sectioned_feeds.items())