diff --git a/recipes/harpers.recipe b/recipes/harpers.recipe index 6f0d8497d5..e5ca7e3b7b 100644 --- a/recipes/harpers.recipe +++ b/recipes/harpers.recipe @@ -1,88 +1,91 @@ -__license__ = 'GPL v3' -__copyright__ = '2008-2012, Darko Miletic ' ''' harpers.org ''' -from calibre.web.feeds.news import BasicNewsRecipe +from calibre.web.feeds.news import BasicNewsRecipe, classes +from calibre import browser class Harpers(BasicNewsRecipe): - title = u"Harper's Magazine" - __author__ = u'Darko Miletic' - language = 'en' - description = u"Harper's Magazine: Founded June 1850." + title = 'Harper’s Magazine' + __author__ = 'unkn0wn' + language = 'en_US' + description = ( + 'Harper’s Magazine, the oldest general-interest monthly in America, explores the issues that drive our ' + 'national conversation, through long-form narrative journalism and essays, and such celebrated ' + 'features as the iconic Harper’s Index. With its emphasis on fine writing and original thought ' + 'Harper’s provides readers with a unique perspective on politics, society, the environment, and culture.' + ) publisher = "Harper's Magazine " category = 'news, politics, USA' - oldest_article = 30 - max_articles_per_feed = 100 no_stylesheets = True use_embedded_content = False - - conversion_options = { - 'comment': description, 'tags': category, 'publisher': publisher, 'language': language - } + masthead_url = 'https://harpers.org/wp-content/themes/timber/assets/img/logo.svg' + ignore_duplicate_articles = {'url'} + encoding = 'utf-8' + remove_attributes = ['style', 'height', 'width'] keep_only_tags = [ - dict( - class_=[ - "article-content", - "template-index-archive", # harper's index - ] - ) + dict(attrs={'class':lambda x: x and ( + 'title-header desktop ' in x or 'col-md-8 col-xl-9' in x + )}), + classes('article-hero-img entry-content pdf-only') ] remove_tags = [ - dict( - class_=[ - "component-newsletter-signup", - "sidebar", - "header-meta", - "component-from-author", - "from-issue", - "d-none", - "COA_roles_fix_space", - "section-tags", - "aria-font-adjusts", - "component-share-buttons", - "index-footer", - "index-prev-link", - "comma", - ] - ), - # for harper's index - dict( - class_=[ - "aria-font-adjusts", - "component-share-buttons", - "index-footer", - "index-prev-link", - ] - ), + classes('header-controls') ] remove_attributes = ["style", "width", "height"] - extra_css = """ - h1.article-title { font-size: x-large; margin-bottom: 0.4rem; } - .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; } - .byline { margin-bottom: 1rem } - .article-hero-img img, .flex-section-image img, .wp-caption img { - display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto; - box-sizing: border-box; - } - .wp-caption-text { font-size: small; margin-top: 0.3rem; } + extra_css = ''' + img {display:block; margin:0 auto;} + .category, .from-issue { font-size:small; color:#404040; } + .wp-caption-text { font-size:small; text-align:center; } + .subheading { font-style:italic; color:#202020; } + .byline { font-size:small; } + em, blockquote { color:#202020; } + ''' - .author-bio { margin-top: 2.5rem; font-style: italic; } - .author-bio em { font-weight: bold; } + def preprocess_html(self, soup): + sub = soup.find(attrs={'class':'subheading'}) + if sub: + sub.name = 'p' + for img in soup.findAll('img', attrs={'srcset':True}): + for src in img['srcset'].split(','): + if '768w' in src: + img['src'] = img['src'].split()[0] + return soup - .index-item { font-size: large; margin: 1rem 0; } - .index-statement > p { display: inline-block; margin: 0.5rem 0; } - .index-statement > span { display: inline-block; } - .index-statement .index-tooltip { font-size: small; } - """ - - def get_cover_url(self): + def parse_index(self): issues_soup = self.index_to_soup("https://harpers.org/issues/") - curr_issue_a_ele = issues_soup.select_one("div.issue-card a") - if curr_issue_a_ele.find("img"): - return curr_issue_a_ele.img["src"] + a_ele = issues_soup.select_one("div.issue-card a") + self.timefmt = ' [' + self.tag_to_string(a_ele.find(attrs={'class':'issue-title'})) + ']' + url = a_ele['href'] + soup = self.index_to_soup(url) + cov_div = soup.find('div', attrs={'class':'issue-cover'}) + if cov_div: + self.cover_url = cov_div.find('img', attrs={'class':'cover-img'})['src'] + ans = [] + for a in soup.findAll('a', attrs={'href':lambda x: x and x.startswith(url + '/')}): + if not a.find('img') and a.find(['h1', 'h2', 'h3', 'h4']): + url = a['href'] + title = self.tag_to_string(a) + desc = '' + div = a.findParent('div').find('div', attrs={'class':'byline'}) + if div: + desc = self.tag_to_string(div) + self.log('\t', title, '\n\t', desc, '\n\t', url) + ans.append({'title': title, 'description': desc, 'url': url}) + return [('Articles', ans)] - feeds = [(u"Harper's Magazine", u'https://harpers.org/feed/')] + # Harpers changes the content it delivers based on cookies, so the + # following ensures that we send no cookies + def get_browser(self, *args, **kwargs): + return self + + def clone_browser(self, *args, **kwargs): + return self.get_browser() + + def open_novisit(self, *args, **kwargs): + br = browser() + return br.open_novisit(*args, **kwargs) + + open = open_novisit diff --git a/recipes/harpers_full.recipe b/recipes/harpers_full.recipe deleted file mode 100644 index 6a1eb2d72d..0000000000 --- a/recipes/harpers_full.recipe +++ /dev/null @@ -1,170 +0,0 @@ -# -*- mode: python -*- -# -*- coding: utf-8 -*- -# vi: set fenc=utf-8 ft=python : -# kate: encoding utf-8; syntax python; - -__license__ = "GPL v3" -__copyright__ = "2008-2019, Darko Miletic " -""" -harpers.org - printed issue articles -This recipe only get's article's published in text format -images and pdf's are ignored -""" - -from calibre import browser -from calibre.web.feeds.news import BasicNewsRecipe - -# overwrite this with a custom issue url, e.g. https://harpers.org/archive/2023/01/ -_issue_url = "" - - -class Harpers_full(BasicNewsRecipe): - title = "Harper's Magazine - articles from printed edition" - __author__ = "Darko Miletic, updated by ping" - description = "Harper's Magazine, the oldest general-interest monthly in America, explores the issues that drive our national conversation, through long-form narrative journalism and essays, and such celebrated features as the iconic Harper's Index." # noqa - publisher = "Harpers's" - category = "news, politics, USA" - oldest_article = 31 - max_articles_per_feed = 100 - no_stylesheets = True - use_embedded_content = False - language = "en" - encoding = "utf8" - publication_type = "magazine" - requires_version = (5, 0, 0) # py3 - ignore_duplicate_articles = {"url"} - base_url = "https://harpers.org" - - keep_only_tags = [ - dict( - class_=[ - "article-content", - "template-index-archive", # harper's index - ] - ) - ] - remove_tags = [ - dict( - class_=[ - "component-newsletter-signup", - "sidebar", - "header-meta", - "component-from-author", - "from-issue", - "d-none", - "COA_roles_fix_space", - "section-tags", - "aria-font-adjusts", - "component-share-buttons", - "index-footer", - "index-prev-link", - "comma", - ] - ), - # for harper's index - dict( - class_=[ - "aria-font-adjusts", - "component-share-buttons", - "index-footer", - "index-prev-link", - ] - ), - ] - remove_attributes = ["style", "width", "height"] - - extra_css = """ - h1.article-title { font-size: x-large; margin-bottom: 0.4rem; } - .subheading, .post-subtitle { font-size: large; font-style: italic; margin-bottom: 1rem; } - .byline { margin-bottom: 1rem } - .article-hero-img img, .flex-section-image img, .wp-caption img { - display: block; margin-bottom: 0.3rem; max-width: 100%; height: auto; - box-sizing: border-box; - } - .wp-caption-text { font-size: small; margin-top: 0.3rem; } - - .author-bio { margin-top: 2.5rem; font-style: italic; } - .author-bio em { font-weight: bold; } - - .index-item { font-size: large; margin: 1rem 0; } - .index-statement > p { display: inline-block; margin: 0.5rem 0; } - .index-statement > span { display: inline-block; } - .index-statement .index-tooltip { font-size: small; } - """ - - # Send cookie-less requests to get full article - def get_browser(self, *args, **kwargs): - return self - - def clone_browser(self, *args, **kwargs): - return self.get_browser() - - def open_novisit(self, *args, **kwargs): - br = browser() - return br.open_novisit(*args, **kwargs) - - open = open_novisit - - def preprocess_html(self, soup): - # General UI tweaks - # move subheading to before byline (instead of where it is now, after) - subheading_ele = soup.find(class_="subheading") - byline_ele = soup.find(class_="byline") - if byline_ele and subheading_ele: - byline_ele.insert_before(subheading_ele.extract()) - - # strip extraneous stuff from author bio - for bio in soup.find_all(class_="author-bio"): - for dec_ele in bio.find_all("br"): - dec_ele.decompose() - for unwrap_ele in bio.find_all("p"): - unwrap_ele.unwrap() - - # remove extraneous hr - for hr in soup.select(".after-post-content hr"): - hr.decompose() - return soup - - def parse_index(self): - if not _issue_url: - issues_soup = self.index_to_soup("https://harpers.org/issues/") - curr_issue_a_ele = issues_soup.select_one("div.issue-card a") - if curr_issue_a_ele.find("img"): - self.cover_url = curr_issue_a_ele.img["src"] - else: - curr_issue_url = _issue_url - - soup = self.index_to_soup(curr_issue_url) - self.timefmt = ( - f' [{self.tag_to_string(soup.find("h1", class_="issue-heading")).strip()}]' - ) - self.cover_url = soup.find("img", class_="cover-img")["src"] - - articles = {} - for section_name in ("features", "readings", "articles"): - section = soup.find("section", class_=f"issue-{section_name}") - if not section: - continue - for card in section.find_all("div", class_="article-card"): - title_ele = card.find(class_="ac-title") - if not title_ele: - continue - article_url = card.find("a")["href"] - article_title = self.tag_to_string(title_ele) - article_description = ( - f'{self.tag_to_string(card.find(class_="ac-tax"))} ' - f'{self.tag_to_string(card.find(class_="ac-subtitle"))}' - ).strip() - byline = card.find(class_="byline") - if byline: - article_description += ( - f' {self.tag_to_string(byline).strip().strip(",")}' - ) - articles.setdefault(section_name.title(), []).append( - { - "url": article_url, - "title": article_title, - "description": article_description, - } - ) - return articles.items() diff --git a/recipes/icons/harpers.png b/recipes/icons/harpers.png index 1e37988d7c..268454be1e 100644 Binary files a/recipes/icons/harpers.png and b/recipes/icons/harpers.png differ diff --git a/recipes/icons/harpers_full.png b/recipes/icons/harpers_full.png deleted file mode 100644 index 1e37988d7c..0000000000 Binary files a/recipes/icons/harpers_full.png and /dev/null differ