calibre/recipes/poetrymagazine.recipe

import re
from collections import OrderedDict
from urllib.parse import urlparse

from calibre.web.feeds.news import BasicNewsRecipe

_issue_url = ""

COMMA_SEP_RE = re.compile(r"\s*,\s*")
SPACE_SEP_RE = re.compile(r"\s+")
NON_NUMERIC_RE = re.compile(r"[^\d]+")


class Poetry(BasicNewsRecipe):
    title = "Poetry Magazine"
    __author__ = "ping"
    description = (
        "Founded in Chicago by Harriet Monroe in 1912, Poetry is the oldest monthly "
        "devoted to verse in the English-speaking world. https://www.poetryfoundation.org/poetrymagazine"
    )
    publication_type = "magazine"
    language = "en"
    encoding = "utf-8"
    remove_javascript = True
    no_stylesheets = True
    auto_cleanup = False
    ignore_duplicate_articles = {"url"}
    compress_news_images = False

    remove_attributes = ["style", "font"]
    keep_only_tags = [dict(name="article")]

    remove_tags = [
        dict(name="button"),
        dict(
            attrs={
                "class": [
                    "c-socialBlocks",
                    "c-index",
                    "o-stereo",
                    "u-hideAboveSmall",
                    "c-slideTrigger",
                    "js-slideshow",
                ]
            }
        ),
    ]

    extra_css = """
    h1 { font-size: 1.8rem; margin-bottom: 0.5rem; }
    .o-titleBar-summary { font-size: 1.2rem; font-style: italic; margin-bottom: 1rem; }
    div.o-titleBar-meta, div.c-feature-sub { font-weight: bold; color: #444; margin-bottom: 1.5rem; }
    div.pcms_media img, div.o-mediaEnclosure img { max-width: 100%; height: auto; }
    div.o-mediaEnclosure .o-mediaEnclosure-metadata { font-size: 0.8rem; margin-top: 0.2rem; }
    div.c-feature-bd { margin-bottom: 2rem; }
    div.c-auxContent { color: #222; font-size: 0.85rem; margin-top: 2rem; }
    """

    def extract_from_img_srcset(self, srcset: str, max_width=0):
        sources = [s.strip() for s in COMMA_SEP_RE.split(srcset) if s.strip()]
        if len(sources) == 1:
            # just a regular img url probably
            return sources[0]
        parsed_sources = []
        for src in sources:
            src_n_width = [s.strip() for s in SPACE_SEP_RE.split(src) if s.strip()]
            if len(src_n_width) != 2:
                raise ValueError(f"Not a valid srcset: {srcset}")
            parsed_sources.append(
                (
                    src_n_width[0].strip(),
                    int(NON_NUMERIC_RE.sub("", src_n_width[1].strip())),
                )
            )
        parsed_sources = list(set(parsed_sources))
        parsed_sources = sorted(parsed_sources, key=lambda x: x[1], reverse=True)
        if not max_width:
            return parsed_sources[0][0]
        for img, width in parsed_sources:
            if width <= max_width:
                return img
        return parsed_sources[-1][0]

    def preprocess_html(self, soup):
        for img in soup.select("div.o-mediaEnclosure img"):
            if not img.get("srcset"):
                continue
            img["src"] = self.extract_from_img_srcset(img["srcset"], max_width=1000)
        return soup

    def parse_index(self):
        if _issue_url:
            soup = self.index_to_soup(_issue_url)
        else:
            soup = self.index_to_soup("https://www.poetryfoundation.org/poetrymagazine")
            current_issue = soup.select("div.c-cover-media a")
            if not current_issue:
                self.abort_recipe_processing("Unable to find latest issue")
            current_issue = current_issue[0]
            soup = self.index_to_soup(current_issue["href"])

        issue_edition = self.tag_to_string(soup.find("h1"))
        self.timefmt = f" [{issue_edition}]"
        cover_image = soup.select("div.c-issueBillboard-cover-media img")[0]
        parsed_cover_url = urlparse(
            cover_image["srcset"].split(",")[-1].strip().split(" ")[0]
        )
        self.cover_url = f"{parsed_cover_url.scheme}://{parsed_cover_url.netloc}{parsed_cover_url.path}"

        sectioned_feeds = OrderedDict()

        tabs = soup.find_all("div", attrs={"class": "c-tier_tabbed"})
        for tab in tabs:
            tab_title = tab.find("div", attrs={"class": "c-tier-tab"})
            tab_content = tab.find("div", attrs={"class": "c-tier-content"})
            if not (tab_title and tab_content):
                continue
            tab_title = self.tag_to_string(tab_title)
            sectioned_feeds[tab_title] = []
            for li in tab_content.select("ul.o-blocks > li"):
                author = self.tag_to_string(
                    li.find("span", attrs={"class": "c-txt_attribution"})
                )
                for link in li.find_all("a", attrs={"class": "c-txt_abstract"}):
                    self.log("Found article:", self.tag_to_string(link))
                    sectioned_feeds[tab_title].append(
                        {
                            "title": self.tag_to_string(link),
                            "url": link["href"],
                            "author": author,
                            "description": author,
                        }
                    )

        return sectioned_feeds.items()