From 91c0aa0b1ff954adb9195fa4b452404ff81d1b3a Mon Sep 17 00:00:00 2001 From: unkn0w7n <51942695+unkn0w7n@users.noreply.github.com> Date: Sun, 6 Oct 2024 12:19:34 +0530 Subject: [PATCH] natgeo --- recipes/natgeo.recipe | 182 ++++----------------- recipes/natgeohis.recipe | 178 +++------------------ recipes/natgeomag.recipe | 212 ++++++------------------- src/calibre/gui2/dialogs/trim_image.py | 8 +- src/calibre/web/site_parsers/natgeo.py | 156 ++++++++++++++++++ 5 files changed, 259 insertions(+), 477 deletions(-) create mode 100644 src/calibre/web/site_parsers/natgeo.py diff --git a/recipes/natgeo.recipe b/recipes/natgeo.recipe index bb403978e3..3c46c7ae35 100644 --- a/recipes/natgeo.recipe +++ b/recipes/natgeo.recipe @@ -1,151 +1,11 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -from __future__ import absolute_import, division, print_function, unicode_literals - -import json from pprint import pformat - -from calibre import prepare_string_for_xml as escape -from calibre.utils.iso8601 import parse_iso8601 -from calibre.web.feeds.news import BasicNewsRecipe - - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - - -def extract_json(raw): - s = raw.find("window['__natgeo__']") - script = raw[s:raw.find('', s)] - return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle'] - - -def parse_contributors(grp): - for item in grp: - line = '
<div class="auth">' + escape(item['title']) + ' '
-        for c in item['contributors']:
-            line += escape(c['displayName'])
-        yield line + '</div>'
-
-
-def parse_lead_image(media):
-    if 'image' in media:
-        yield '<p>'
-        if 'dsc' in media['image']:
-            yield '<div><img src="{}" alt="{}"></div>'.format(
-                escape(media['image']['src'], True), escape(media['image']['dsc'], True)
-            )
-        else:
-            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media and 'credit' in media:
-            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
-        elif 'caption' in media:
-            yield '<div class="cap">' + media['caption'] + '</div>'
-        yield '</p>'
-
-
-def parse_inline(inl):
-    if inl.get('content', {}).get('name', '') == 'Image':
-        props = inl['content']['props']
-        yield '<p>'
-        if 'image' in props:
-            yield '<div><img src="{}"></div>'.format(props['image']['src'])
-        if 'caption' in props:
-            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
-            )
-        yield '</p>'
-    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-        if 'images' in inl['content']['props']:
-            for imgs in inl['content']['props']['images']:
-                yield '<p>'
-                if 'src' in imgs:
-                    yield '<div><img src="{}"></div>'.format(imgs['src'])
-                if 'caption' in imgs:
-                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
-                    )
-                yield '</p>'
-
-
-def parse_cont(content):
-    for cont in content.get('content', {}):
-        if isinstance(cont, dict):
-            yield from parse_body(cont)
-        if isinstance(cont, str):
-            yield cont
-
-
-def parse_body(x):
-    if isinstance(x, dict):
-        if 'type' in x:
-            tag = x['type']
-            if tag == 'inline':
-                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', ''):
-                yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-            else:
-                yield '<' + tag + '>'
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-    elif isinstance(x, list):
-        for y in x:
-            if isinstance(y, dict):
-                yield from parse_body(y)
-
-
-def parse_article(edg):
-    sc = edg['schma']
-    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
-    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
-    yield '<p>'
-    for line in parse_contributors(edg.get('cntrbGrp', {})):
-        yield line
-    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
-    yield '<div class="time">Published: ' + escape(ts) + '</div>'
-    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
-    yield '</p>
' - if edg.get('ldMda', {}).get('cmsType') == 'image': - for line in parse_lead_image(edg['ldMda']): - yield line - for main in edg['prismData']['mainComponents']: - if main['name'] == 'Body': - for item in main['props']['body']: - if isinstance(item, dict): - if item.get('type', '') == 'inline': - yield ''.join(parse_inline(item)) - elif isinstance(item, list): - for line in item: - yield ''.join(parse_body(line)) - - -def article_parse(data): - yield "" - for frm in data['frms']: - if not frm: - continue - for mod in frm.get('mods', ()): - for edg in mod.get('edgs', ()): - if edg.get('cmsType') == 'ImmersiveLeadTile': - if 'image' in edg.get('cmsImage', {}): - for line in parse_lead_image(edg['cmsImage']): - yield line - if edg.get('cmsType') == 'ArticleBodyTile': - for line in parse_article(edg): - yield line - yield "" +from calibre.web.feeds.news import BasicNewsRecipe, classes class NatGeo(BasicNewsRecipe): - title = u'National Geographic' + title = 'National Geographic' description = 'News articles from The National Geographic, Download Monthly.' language = 'en' encoding = 'utf8' @@ -167,26 +27,42 @@ class NatGeo(BasicNewsRecipe): 'res': { 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', - 'default': '600' + 'default': '600', } } - extra_css = ''' + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ blockquote { color:#404040; } .byline, i { font-style:italic; color:#202020; } .cap { font-size:small; } img {display:block; margin:0 auto;} .cred { font-style:italic; font-size:small; color:#404040; } .auth, .time, .sub { font-size:small; color:#5c5c5c; } - ''' + """ def get_cover_url(self): # soup = self.index_to_soup('https://www.nationalgeographic.com/magazine/') # png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', soup.decode('utf-8')) from datetime import date - url = 'https://www.nationalgeographic.com/magazine/issue/' + (date.today().strftime('%B-%Y')).lower() + + url = ( + 'https://www.nationalgeographic.com/magazine/issue/' + + (date.today().strftime('%B-%Y')).lower() + ) soup = self.index_to_soup(url) - png = soup.find('meta', attrs={'property':'og:image'})['content'].split('?') + png = soup.find('meta', attrs={'property': 'og:image'})['content'].split('?') return png[0] + '?w=1000&h=1000' def parse_index(self): @@ -195,7 +71,7 @@ class NatGeo(BasicNewsRecipe): 'https://www.nationalgeographic.com/environment', 'https://www.nationalgeographic.com/history', 'https://www.nationalgeographic.com/science', - 'https://www.nationalgeographic.com/travel' + 'https://www.nationalgeographic.com/travel', ] feeds = [] @@ -217,16 +93,14 @@ class NatGeo(BasicNewsRecipe): section = self.tag_to_string(article.find(**classes('SectionLabel'))) if section.startswith('Paid Content'): continue - title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) + title = self.tag_to_string( + article.find(**classes('PromoTile__Title--truncated')) + ) articles = ans.setdefault(section, []) articles.append({'title': title, 'url': url}) self.log(pformat(ans)) return list(ans.items()) - def 
preprocess_raw_html(self, raw_html, url): - data = extract_json(raw_html) - return '\n'.join(article_parse(data)) - def preprocess_html(self, soup): for h2 in soup.findAll('h2'): h2.name = 'h4' @@ -239,7 +113,7 @@ class NatGeo(BasicNewsRecipe): return soup def populate_article_metadata(self, article, soup, first): - summ = soup.find(attrs={'class':'byline'}) + summ = soup.find(attrs={'class': 'byline'}) if summ: article.summary = self.tag_to_string(summ) article.text_summary = self.tag_to_string(summ) diff --git a/recipes/natgeohis.recipe b/recipes/natgeohis.recipe index b95967520b..61012a18e1 100644 --- a/recipes/natgeohis.recipe +++ b/recipes/natgeohis.recipe @@ -1,150 +1,10 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -from __future__ import absolute_import, division, print_function, unicode_literals - -import json - -from calibre import prepare_string_for_xml as escape -from calibre.utils.iso8601 import parse_iso8601 -from calibre.web.feeds.news import BasicNewsRecipe - - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - - -def extract_json(raw): - s = raw.find("window['__natgeo__']") - script = raw[s:raw.find('', s)] - return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle'] - - -def parse_contributors(grp): - for item in grp: - line = '
<div class="auth">' + escape(item['title']) + ' '
-        for c in item['contributors']:
-            line += escape(c['displayName'])
-        yield line + '</div>'
-
-
-def parse_lead_image(media):
-    if 'image' in media:
-        yield '<p>'
-        if 'dsc' in media['image']:
-            yield '<div><img src="{}" alt="{}"></div>'.format(
-                escape(media['image']['src'], True), escape(media['image']['dsc'], True)
-            )
-        else:
-            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media and 'credit' in media:
-            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
-        elif 'caption' in media:
-            yield '<div class="cap">' + media['caption'] + '</div>'
-        yield '</p>'
-
-
-def parse_inline(inl):
-    if inl.get('content', {}).get('name', '') == 'Image':
-        props = inl['content']['props']
-        yield '<p>'
-        if 'image' in props:
-            yield '<div><img src="{}"></div>'.format(props['image']['src'])
-        if 'caption' in props:
-            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
-            )
-        yield '</p>'
-    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-        if 'images' in inl['content']['props']:
-            for imgs in inl['content']['props']['images']:
-                yield '<p>'
-                if 'src' in imgs:
-                    yield '<div><img src="{}"></div>'.format(imgs['src'])
-                if 'caption' in imgs:
-                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
-                    )
-                yield '</p>'
-
-
-def parse_cont(content):
-    for cont in content.get('content', {}):
-        if isinstance(cont, dict):
-            yield from parse_body(cont)
-        if isinstance(cont, str):
-            yield cont
-
-
-def parse_body(x):
-    if isinstance(x, dict):
-        if 'type' in x:
-            tag = x['type']
-            if tag == 'inline':
-                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', ''):
-                yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-            else:
-                yield '<' + tag + '>'
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-    elif isinstance(x, list):
-        for y in x:
-            if isinstance(y, dict):
-                yield from parse_body(y)
-
-
-def parse_article(edg):
-    sc = edg['schma']
-    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
-    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
-    yield '<p>'
-    for line in parse_contributors(edg.get('cntrbGrp', {})):
-        yield line
-    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
-    yield '<div class="time">Published: ' + escape(ts) + '</div>'
-    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
-    yield '</p>
' - if edg.get('ldMda', {}).get('cmsType') == 'image': - for line in parse_lead_image(edg['ldMda']): - yield line - for main in edg['prismData']['mainComponents']: - if main['name'] == 'Body': - for item in main['props']['body']: - if isinstance(item, dict): - if item.get('type', '') == 'inline': - yield ''.join(parse_inline(item)) - elif isinstance(item, list): - for line in item: - yield ''.join(parse_body(line)) - - -def article_parse(data): - yield "" - for frm in data['frms']: - if not frm: - continue - for mod in frm.get('mods', ()): - for edg in mod.get('edgs', ()): - if edg.get('cmsType') == 'ImmersiveLeadTile': - if 'image' in edg.get('cmsImage', {}): - for line in parse_lead_image(edg['cmsImage']): - yield line - if edg.get('cmsType') == 'ArticleBodyTile': - for line in parse_article(edg): - yield line - yield "" +from calibre.web.feeds.news import BasicNewsRecipe, classes class NatGeo(BasicNewsRecipe): - title = u'National Geographic History' + title = 'National Geographic History' description = ( 'From Caesar to Napoleon, the Pyramids to the Parthenon, the Trojan War to the Civil War—National Geographic ' 'HISTORY draws readers in with more than 5,000 years of people, places, and things to explore.' @@ -167,41 +27,53 @@ class NatGeo(BasicNewsRecipe): 'res': { 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', - 'default': '600' + 'default': '600', } } - extra_css = ''' + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ blockquote { color:#404040; } .byline, i { font-style:italic; color:#202020; } .cap { font-size:small; } img {display:block; margin:0 auto;} .cred { font-style:italic; font-size:small; color:#404040; } .auth, .time, .sub { font-size:small; color:#5c5c5c; } - ''' + """ def get_cover_url(self): soup = self.index_to_soup('https://ngsingleissues.nationalgeographic.com/history') - wrap = soup.find(attrs={'class':'product-image-wrapper'}) + wrap = soup.find(attrs={'class': 'product-image-wrapper'}) return wrap.img['src'] def parse_index(self): - soup = self.index_to_soup('https://www.nationalgeographic.com/history/history-magazine') + soup = self.index_to_soup( + 'https://www.nationalgeographic.com/history/history-magazine' + ) ans = [] for article in soup.findAll('article'): a = article.find('a') url = a['href'] if url.startswith('/'): url = 'https://www.nationalgeographic.com' + url - title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) + title = self.tag_to_string( + article.find(**classes('PromoTile__Title--truncated')) + ) ans.append({'title': title, 'url': url}) self.log(title, ' ', url) return [('Articles', ans)] - def preprocess_raw_html(self, raw_html, url): - data = extract_json(raw_html) - return '\n'.join(article_parse(data)) - def preprocess_html(self, soup): for h2 in soup.findAll('h2'): h2.name = 'h4' @@ -214,7 +86,7 @@ class NatGeo(BasicNewsRecipe): return soup def populate_article_metadata(self, article, soup, first): - summ = soup.find(attrs={'class':'byline'}) + summ = soup.find(attrs={'class': 'byline'}) if summ: article.summary = self.tag_to_string(summ) 
article.text_summary = self.tag_to_string(summ) diff --git a/recipes/natgeomag.recipe b/recipes/natgeomag.recipe index f13e891493..97f9852961 100644 --- a/recipes/natgeomag.recipe +++ b/recipes/natgeomag.recipe @@ -1,152 +1,12 @@ #!/usr/bin/env python # vim:fileencoding=utf-8 -from __future__ import absolute_import, division, print_function, unicode_literals - -import json from datetime import date from pprint import pformat - -from calibre import prepare_string_for_xml as escape -from calibre.utils.iso8601 import parse_iso8601 -from calibre.web.feeds.news import BasicNewsRecipe - - -def classes(classes): - q = frozenset(classes.split(' ')) - return dict(attrs={ - 'class': lambda x: x and frozenset(x.split()).intersection(q)}) - - -def extract_json(raw): - s = raw.find("window['__natgeo__']") - script = raw[s:raw.find('', s)] - return json.loads(script[script.find('{'):].rstrip(';'))['page']['content']['prismarticle'] - - -def parse_contributors(grp): - for item in grp: - line = '
<div class="auth">' + escape(item['title']) + ' '
-        for c in item['contributors']:
-            line += escape(c['displayName'])
-        yield line + '</div>'
-
-
-def parse_lead_image(media):
-    if 'image' in media:
-        yield '<p>'
-        if 'dsc' in media['image']:
-            yield '<div><img src="{}" alt="{}"></div>'.format(
-                escape(media['image']['src'], True), escape(media['image']['dsc'], True)
-            )
-        else:
-            yield '<div><img src="{}"></div>'.format(escape(media['image']['src'], True))
-        if 'caption' in media and 'credit' in media:
-            yield '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
-        elif 'caption' in media:
-            yield '<div class="cap">' + media['caption'] + '</div>'
-        yield '</p>'
-
-
-def parse_inline(inl):
-    if inl.get('content', {}).get('name', '') == 'Image':
-        props = inl['content']['props']
-        yield '<p>'
-        if 'image' in props:
-            yield '<div><img src="{}"></div>'.format(props['image']['src'])
-        if 'caption' in props:
-            yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                props['caption'].get('text', ''), ' ' + props['caption'].get('credit', '')
-            )
-        yield '</p>'
-    if inl.get('content', {}).get('name', '') == 'ImageGroup':
-        if 'images' in inl['content']['props']:
-            for imgs in inl['content']['props']['images']:
-                yield '<p>'
-                if 'src' in imgs:
-                    yield '<div><img src="{}"></div>'.format(imgs['src'])
-                if 'caption' in imgs:
-                    yield '<div class="cap">{}<span class="cred">{}</span></div>'.format(
-                        imgs['caption'].get('text', ''), ' ' + imgs['caption'].get('credit', '')
-                    )
-                yield '</p>'
-
-
-def parse_cont(content):
-    for cont in content.get('content', {}):
-        if isinstance(cont, dict):
-            yield from parse_body(cont)
-        if isinstance(cont, str):
-            yield cont
-
-
-def parse_body(x):
-    if isinstance(x, dict):
-        if 'type' in x:
-            tag = x['type']
-            if tag == 'inline':
-                yield ''.join(parse_inline(x))
-            elif 'attrs' in x and 'href' in x.get('attrs', ''):
-                yield '<' + tag + ' href="{}">'.format(x['attrs']['href'])
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-            else:
-                yield '<' + tag + '>'
-                for yld in parse_cont(x):
-                    yield yld
-                yield '</' + tag + '>'
-    elif isinstance(x, list):
-        for y in x:
-            if isinstance(y, dict):
-                yield from parse_body(y)
-
-
-def parse_article(edg):
-    sc = edg['schma']
-    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
-    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
-    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
-    yield '<p>'
-    for line in parse_contributors(edg.get('cntrbGrp', {})):
-        yield line
-    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
-    yield '<div class="time">Published: ' + escape(ts) + '</div>'
-    if 'readTime' in edg:
-        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
-    yield '</p>
' - if edg.get('ldMda', {}).get('cmsType') == 'image': - for line in parse_lead_image(edg['ldMda']): - yield line - for main in edg['prismData']['mainComponents']: - if main['name'] == 'Body': - for item in main['props']['body']: - if isinstance(item, dict): - if item.get('type', '') == 'inline': - yield ''.join(parse_inline(item)) - elif isinstance(item, list): - for line in item: - yield ''.join(parse_body(line)) - - -def article_parse(data): - yield "" - for frm in data['frms']: - if not frm: - continue - for mod in frm.get('mods', ()): - for edg in mod.get('edgs', ()): - if edg.get('cmsType') == 'ImmersiveLeadTile': - if 'image' in edg.get('cmsImage', {}): - for line in parse_lead_image(edg['cmsImage']): - yield line - if edg.get('cmsType') == 'ArticleBodyTile': - for line in parse_article(edg): - yield line - yield "" +from calibre.web.feeds.news import BasicNewsRecipe, classes class NatGeo(BasicNewsRecipe): - title = u'National Geographic Magazine' + title = 'National Geographic Magazine' description = 'The National Geographic, an American monthly magazine' language = 'en' encoding = 'utf8' @@ -163,26 +23,38 @@ class NatGeo(BasicNewsRecipe): remove_empty_feeds = True resolve_internal_links = True - extra_css = ''' + recipe_specific_options = { + 'date': { + 'short': 'The date of the edition to download (Month-YYYY format)', + 'long': 'For example, March-2023', + }, + 'res': { + 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', + 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', + 'default': '600', + }, + } + + @property + def natgeo_parser(self): + ans = getattr(self, '_natgeo_parser', None) + if ans is None: + from calibre.live import load_module + + self._natgeo_parser = ans = load_module('calibre.web.site_parsers.natgeo') + return ans + + def preprocess_raw_html(self, raw_html, url): + return self.natgeo_parser.extract_html(raw_html) + + extra_css = """ blockquote { color:#404040; } .byline, i { font-style:italic; color:#202020; } .cap { font-size:small; } img {display:block; margin:0 auto;} .cred { font-style:italic; font-size:small; color:#404040; } .auth, .time, .sub { font-size:small; color:#5c5c5c; } - ''' - - recipe_specific_options = { - 'date': { - 'short': 'The date of the edition to download (Month-YYYY format)', - 'long': 'For example, March-2023' - }, - 'res': { - 'short': 'For hi-res images, select a resolution from the\nfollowing options: 800, 1000, 1200 or 1500', - 'long': 'This is useful for non e-ink devices, and for a lower file size\nthan the default, use 400 or 300.', - 'default': '600' - } - } + """ def parse_index(self): edition = date.today().strftime('%B-%Y') @@ -195,11 +67,19 @@ class NatGeo(BasicNewsRecipe): soup = self.index_to_soup(url) # png = re.findall('https://i\.natgeofe\.com\S+?national-geographic-\S+?\.jpg', soup.decode('utf-8')) # self.cover_url = png[0] + '?w=1000&h=1000' - self.cover_url = soup.find('meta', attrs={'property':'og:image'})['content'].split('?')[0] + '?w=1000' + self.cover_url = ( + soup.find('meta', attrs={'property': 'og:image'})['content'].split('?')[0] + + '?w=1000' + ) # self.title = 'National Geographic ' + self.tag_to_string(name) ans = {} - if photoart := soup.find(attrs={'class':lambda x: x and 'BgImagePromo__Container__Text__Link' in x.split()}): + if photoart := soup.find( + attrs={ + 'class': lambda x: x + and 'BgImagePromo__Container__Text__Link' in x.split() + } + ): section = 'Photo Essay' title = 
self.tag_to_string(photoart) url = photoart['href'] @@ -211,10 +91,12 @@ class NatGeo(BasicNewsRecipe): if promo.find('a', attrs={'href': True}) and promo.a.get('href'): url = promo.a['href'] section = self.tag_to_string(promo.find(**classes('SectionLabel'))) - title = self.tag_to_string(promo.find(**classes('Card__Content__Heading'))) + title = self.tag_to_string( + promo.find(**classes('Card__Content__Heading')) + ) articles = ans.setdefault(section, []) articles.append({'title': title, 'url': url}) - for gird in soup.findAll(attrs={'class':'GridPromoTile'}): + for gird in soup.findAll(attrs={'class': 'GridPromoTile'}): for article in soup.findAll('article'): a = article.find('a') url = a['href'] @@ -223,16 +105,14 @@ class NatGeo(BasicNewsRecipe): if '/graphics/' in url: continue section = self.tag_to_string(article.find(**classes('SectionLabel'))) - title = self.tag_to_string(article.find(**classes('PromoTile__Title--truncated'))) + title = self.tag_to_string( + article.find(**classes('PromoTile__Title--truncated')) + ) articles = ans.setdefault(section, []) articles.append({'title': title, 'url': url}) self.log(pformat(ans)) return list(ans.items()) - def preprocess_raw_html(self, raw_html, url): - data = extract_json(raw_html) - return '\n'.join(article_parse(data)) - def preprocess_html(self, soup): for h2 in soup.findAll('h2'): h2.name = 'h4' @@ -245,7 +125,7 @@ class NatGeo(BasicNewsRecipe): return soup def populate_article_metadata(self, article, soup, first): - summ = soup.find(attrs={'class':'byline'}) + summ = soup.find(attrs={'class': 'byline'}) if summ: article.summary = self.tag_to_string(summ) article.text_summary = self.tag_to_string(summ) diff --git a/src/calibre/gui2/dialogs/trim_image.py b/src/calibre/gui2/dialogs/trim_image.py index b3c039821d..2c576f506e 100644 --- a/src/calibre/gui2/dialogs/trim_image.py +++ b/src/calibre/gui2/dialogs/trim_image.py @@ -28,10 +28,10 @@ from calibre.gui2 import gprefs from calibre.gui2.tweak_book.editor.canvas import Canvas -def reduce_to_ratio(w, h, t): - h = min(h, w / t) - w = t * h - return int(w), int(h) +def reduce_to_ratio(w, h, r): + h = min(h, w / r) + w = r * h + return int(round(w)), int(round(h)) class Region(QDialog): diff --git a/src/calibre/web/site_parsers/natgeo.py b/src/calibre/web/site_parsers/natgeo.py new file mode 100644 index 0000000000..6990bd2fdc --- /dev/null +++ b/src/calibre/web/site_parsers/natgeo.py @@ -0,0 +1,156 @@ +#!/usr/bin/env python + +from __future__ import ( + absolute_import, + division, + print_function, + unicode_literals, +) + +import json +from pprint import pprint + +from calibre import prepare_string_for_xml as escape +from calibre.utils.iso8601 import parse_iso8601 + + +module_version = 1 # needed for live updates +pprint + + +def extract_json(raw): + s = raw.find("window['__natgeo__']") + script = raw[s : raw.find('', s)] + return json.loads(script[script.find('{') :].rstrip(';'))['page']['content'][ + 'prismarticle' + ] + + +def parse_contributors(grp): + for item in grp: + line = '
<div class="auth">' + escape(item['title']) + ' '
+        for c in item['contributors']:
+            line += escape(c['displayName'])
+        yield line + '</div>'
+
+
+def parse_lead_image(media):
+    if 'image' in media:
+        yield '<p>'
+        if 'dsc' in media['image']:
+            yield (
+                f'<div><img src="{escape(media["image"]["src"], True)}" alt="{escape(media["image"]["dsc"], True)}"></div>'
+            )
+        else:
+            yield f'<div><img src="{escape(media["image"]["src"], True)}"></div>'
+        if 'caption' in media and 'credit' in media:
+            yield (
+                '<div class="cap">' + media['caption'] + '<span class="cred"> ' + media['credit'] + '</span></div>'
+            )
+        elif 'caption' in media:
+            yield '<div class="cap">' + media['caption'] + '</div>'
+        yield '</p>'
+
+
+def parse_inline(inl):
+    if inl.get('content', {}).get('name', '') == 'Image':
+        props = inl['content']['props']
+        yield '<p>'
+        if 'image' in props:
+            yield f'<div><img src="{props["image"]["src"]}"></div>'
+        if 'caption' in props:
+            yield (
+                f'<div class="cap">{props["caption"].get("text", "")}<span class="cred"> {props["caption"].get("credit", "")}</span></div>'
+            )
+        yield '</p>'
+    if inl.get('content', {}).get('name', '') == 'ImageGroup':
+        if 'images' in inl['content']['props']:
+            for imgs in inl['content']['props']['images']:
+                yield '<p>'
+                if 'src' in imgs:
+                    yield f'<div><img src="{imgs["src"]}"></div>'
+                if 'caption' in imgs:
+                    yield (
+                        f'<div class="cap">{imgs["caption"].get("text", "")}<span class="cred"> {imgs["caption"].get("credit", "")}</span></div>'
+                    )
+                yield '</p>'
+
+
+def parse_cont(content):
+    for cont in content.get('content', {}):
+        if isinstance(cont, dict):
+            yield from parse_body(cont)
+        if isinstance(cont, str):
+            yield cont
+
+
+def parse_body(x):
+    if isinstance(x, dict):
+        if 'type' in x:
+            tag = x['type']
+            if tag == 'inline':
+                yield ''.join(parse_inline(x))
+            elif 'attrs' in x and 'href' in x.get('attrs', ''):
+                yield '<' + tag + f' href="{x["attrs"]["href"]}">'
+                yield from parse_cont(x)
+                yield '</' + tag + '>'
+            else:
+                yield '<' + tag + '>'
+                yield from parse_cont(x)
+                yield '</' + tag + '>'
+    elif isinstance(x, list):
+        for y in x:
+            if isinstance(y, dict):
+                yield from parse_body(y)
+
+
+def parse_article(edg):
+    sc = edg['schma']
+    yield '<div class="sub">' + escape(edg['sctn']) + '</div>'
+    yield '<h1>' + escape(sc['sclTtl']) + '</h1>'
+    yield '<div class="byline">' + escape(sc['sclDsc']) + '</div>'
+    yield '<p>'
+    yield from parse_contributors(edg.get('cntrbGrp', {}))
+    ts = parse_iso8601(edg['mdDt'], as_utc=False).strftime('%B %d, %Y')
+    yield '<div class="time">Published: ' + escape(ts) + '</div>'
+    if 'readTime' in edg:
+        yield '<div class="time">' + escape(edg['readTime']) + '</div>'
+    yield '</p>'
+    if edg.get('ldMda', {}).get('cmsType') == 'image':
+        yield from parse_lead_image(edg['ldMda'])
+    for main in edg['prismData']['mainComponents']:
+        if main['name'] == 'Body':
+            for item in main['props']['body']:
+                if isinstance(item, dict):
+                    if item.get('type', '') == 'inline':
+                        yield ''.join(parse_inline(item))
+                elif isinstance(item, list):
+                    for line in item:
+                        yield ''.join(parse_body(line))
+
+
+def article_parse(data):
+    yield '<html><body>'
+    for frm in data['frms']:
+        if not frm:
+            continue
+        for mod in frm.get('mods', ()):
+            for edg in mod.get('edgs', ()):
+                if edg.get('cmsType') == 'ImmersiveLeadTile':
+                    if 'image' in edg.get('cmsImage', {}):
+                        yield from parse_lead_image(edg['cmsImage'])
+                if edg.get('cmsType') == 'ArticleBodyTile':
+                    yield from parse_article(edg)
+    yield '</body></html>'
+
+
+def extract_html(raw_html):
+    data = extract_json(raw_html)
+    return '\n'.join(article_parse(data))