mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 10:44:09 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
7a7f93c61a
@ -2,7 +2,6 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import html5lib
|
|
||||||
import mechanize
|
import mechanize
|
||||||
from lxml import html
|
from lxml import html
|
||||||
|
|
||||||
@ -86,7 +85,7 @@ def get_issue_data(br, log, node_id='1126213', year='2020', volnum='99', issue_v
|
|||||||
))[0]['_source']
|
))[0]['_source']
|
||||||
|
|
||||||
if 'field_issue_sspecial_articles__nid' in issue_data:
|
if 'field_issue_sspecial_articles__nid' in issue_data:
|
||||||
main_sec_title = issue_data['title'][0]
|
main_sec_title = issue_data['field_issue_sspecial_header'][0]
|
||||||
main_sec_nids = issue_data['field_issue_sspecial_articles__nid']
|
main_sec_nids = issue_data['field_issue_sspecial_articles__nid']
|
||||||
articles_data = get_data(make_query(nid=main_sec_nids, filter=True, size=len(main_sec_nids)))
|
articles_data = get_data(make_query(nid=main_sec_nids, filter=True, size=len(main_sec_nids)))
|
||||||
articles = []
|
articles = []
|
||||||
@ -121,10 +120,18 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
publisher = u'Council on Foreign Relations'
|
publisher = u'Council on Foreign Relations'
|
||||||
category = u'USA, Foreign Affairs'
|
category = u'USA, Foreign Affairs'
|
||||||
description = u'The leading forum for serious discussion of American foreign policy and international affairs.'
|
description = u'The leading forum for serious discussion of American foreign policy and international affairs.'
|
||||||
|
encoding = 'utf-8'
|
||||||
no_stylesheets = True
|
no_stylesheets = True
|
||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
needs_subscription = 'optional'
|
needs_subscription = 'optional'
|
||||||
|
remove_attributes = ['style', 'height', 'width']
|
||||||
|
masthead_url = 'https://upload.wikimedia.org/wikipedia/commons/thumb/9/9a/Foreign_Affairs_Logo.svg/500px-Foreign_Affairs_Logo.svg.png'
|
||||||
|
extra_css = '''
|
||||||
|
.topper__issue, .topper__date, .topper__byline, figure__caption, .calibre-nuked-tag-figcaption { font-size: small; }
|
||||||
|
.topper__subtitle { font-style: italic; color: #202020; }
|
||||||
|
em, blockquote { color: #202020; }
|
||||||
|
img {display:block; margin:0 auto;}
|
||||||
|
'''
|
||||||
|
|
||||||
INDEX = 'https://www.foreignaffairs.com/magazine'
|
INDEX = 'https://www.foreignaffairs.com/magazine'
|
||||||
|
|
||||||
@ -136,10 +143,12 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
}
|
}
|
||||||
|
|
||||||
keep_only_tags = [
|
keep_only_tags = [
|
||||||
classes('article-header article-body article-lead-image article-body-text'),
|
classes('topper__heading-container topper__image-container paywall-content'),
|
||||||
]
|
]
|
||||||
|
|
||||||
remove_tags = [
|
remove_tags = [
|
||||||
classes('loading-indicator paywall article-footer article-tools')
|
dict(name=['svg', 'meta']),
|
||||||
|
classes('article-newsletter-signup--container dfp-tag-wrapper')
|
||||||
]
|
]
|
||||||
|
|
||||||
conversion_options = {'comments': description, 'tags': category, 'language': 'en',
|
conversion_options = {'comments': description, 'tags': category, 'language': 'en',
|
||||||
@ -160,39 +169,27 @@ class ForeignAffairsRecipe(BasicNewsRecipe):
|
|||||||
self.timefmt = u' [%s]' % date
|
self.timefmt = u' [%s]' % date
|
||||||
link = soup.find('link', rel='canonical', href=True)['href']
|
link = soup.find('link', rel='canonical', href=True)['href']
|
||||||
year, volnum, issue_vol = link.split('/')[-3:]
|
year, volnum, issue_vol = link.split('/')[-3:]
|
||||||
|
cov = soup.find('img', attrs={'srcset': lambda x: x and 'Cover.jpg' in x})
|
||||||
|
if cov:
|
||||||
self.cover_url = re.sub(
|
self.cover_url = re.sub(
|
||||||
r"_webp_issue_small_\dx",
|
r"_webp_issue_small_\dx",
|
||||||
"_webp_issue_large_2x",
|
"_webp_issue_large_2x",
|
||||||
soup.find(class_="subscribe-callout-image")["srcset"]
|
cov["srcset"].split()[0]
|
||||||
.split(",")[0]
|
|
||||||
.strip()
|
|
||||||
.split(" ")[0],
|
|
||||||
)
|
)
|
||||||
|
|
||||||
cls = soup.find('body')['class']
|
cls = soup.find('link', attrs={'rel':'shortlink'})['href']
|
||||||
if isinstance(cls, (list, tuple)):
|
node_id = re.search(r'https://www.foreignaffairs.com/node/(\d+)', cls).group(1)
|
||||||
cls = ' '.join(cls)
|
|
||||||
node_id = re.search(r'\bpage-node-(\d+)\b', cls).group(1)
|
|
||||||
br = self.cloned_browser
|
br = self.cloned_browser
|
||||||
feeds = get_issue_data(br, self.log, node_id, year, volnum, issue_vol)
|
feeds = get_issue_data(br, self.log, node_id, year, volnum, issue_vol)
|
||||||
return feeds
|
return feeds
|
||||||
|
|
||||||
def clean_fa_html(self, root):
|
|
||||||
for svg in tuple(root.iter('{*}svg')):
|
|
||||||
svg.getparent().remove(svg)
|
|
||||||
for meta in tuple(root.iter('{*}meta')):
|
|
||||||
meta.getparent().remove(meta)
|
|
||||||
return root
|
|
||||||
|
|
||||||
def preprocess_raw_html(self, raw_html, url):
|
|
||||||
root = html5lib.parse(raw_html, treebuilder='lxml',
|
|
||||||
namespaceHTMLElements=False).getroot()
|
|
||||||
self.clean_fa_html(root)
|
|
||||||
return html.tostring(root, encoding='unicode')
|
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
|
for h2 in soup.findAll(**classes('topper__subtitle')):
|
||||||
|
h2.name = 'p'
|
||||||
|
for by in soup.findAll(**classes('topper__byline topper__issue topper__date')):
|
||||||
|
by.name = 'div'
|
||||||
for img in soup.find_all('img', attrs={'srcset': True}):
|
for img in soup.find_all('img', attrs={'srcset': True}):
|
||||||
img['src'] = img['srcset'].split(',')[-1].strip().split(' ')[0].strip()
|
img['src'] = re.sub(r"_webp_small_\dx", "_webp_large_1x",img['srcset'].split()[0])
|
||||||
return soup
|
return soup
|
||||||
|
|
||||||
def get_browser(self):
|
def get_browser(self):
|
||||||
|
Loading…
x
Reference in New Issue
Block a user