Merge branch 'master' of github.com:kovidgoyal/calibre

This commit is contained in:
Kovid Goyal 2019-12-13 18:09:42 +05:30
commit 15e247f88a
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
2 changed files with 132 additions and 24 deletions

View File

@ -1,19 +1,67 @@
#!/usr/bin/env python2
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
from collections import OrderedDict
from html5_parser import parse
from lxml import etree
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.web.feeds.news import BasicNewsRecipe
def E(parent, name, text='', **attrs):
    """Create a child element called *name* under *parent*.

    The new element's text is set to *text*, any extra keyword
    arguments become its attributes, and the element is appended to
    *parent* before being returned.
    """
    child = parent.makeelement(name, **attrs)
    child.text = text
    parent.append(child)
    return child
def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into child elements of *html_parent*.

    A ``'tag'`` node becomes a new child element carrying the node's
    attributes, with its children processed recursively.  A ``'text'``
    node is merged into the parent's text (or the tail of the parent's
    last child, if any).  Any other node type is ignored.
    """
    kind = node.get('type')
    if kind == 'tag':
        element = html_parent.makeelement(node['name'])
        element.attrib.update(node.get('attribs', {}))
        html_parent.append(element)
        for child in node.get('children', ()):
            process_node(child, element)
        return
    if kind != 'text':
        return
    data = node.get('data')
    if not data:
        return
    if len(html_parent):
        last = html_parent[-1]
        last.tail = (last.tail or '') + data
    else:
        html_parent.text = (html_parent.text or '') + data
def load_article_from_json(raw, root):
    """Rebuild the <body> of *root* from the page's JSON payload.

    *raw* is the JSON text of the page's ``__NEXT_DATA__`` script.  The
    original body markup is discarded and replaced with a new <article>
    element containing the subheadline, headline, description, date
    line, the main image (when present) and the article text nodes.
    """
    content = json.loads(raw)['props']['pageProps']['content']
    body = root.xpath('//body')[0]
    # Discard the original page markup; everything is rebuilt from JSON.
    for existing in tuple(body):
        body.remove(existing)
    article = E(body, 'article')
    E(article, 'h4', content['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', content['headline'], style='font-size: x-large')
    E(article, 'div', content['description'], style='font-style: italic')
    dateline = (content['datePublishedString'] or '') + ' | ' + (content['dateline'] or '')
    E(article, 'div', dateline, style='color: gray; margin: 1em')
    if 'main' in content['image']:
        img_div = E(article, 'div')
        try:
            E(img_div, 'img', src=content['image']['main']['url']['canonical'])
        except Exception:
            pass  # best effort: the image metadata may be incomplete
    for node in content['text']:
        process_node(node, article)
def classes(classes):
@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
resolve_internal_links = True
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict(attrs={'aria-label': "Article Teaser"}),
dict(attrs={
'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header',
@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
),
dict(attrs={
'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
classes(
'share-links-header teaser--wrapped latest-updates-panel__container'
' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
)
]
keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True
@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
return br
def preprocess_raw_html(self, raw, url):
import html5lib
root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
from lxml import etree
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root = parse(raw)
script = root.xpath('//script[@id="__NEXT_DATA__"]')
if script:
load_article_from_json(script[0].text, root)
for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
img = list(parse(noscript[0].text).iter('img'))
if img:
p = noscript[0].getparent()
idx = p.index(noscript[0])
@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
# with open('/t/raw.html', 'wb') as f:

View File

@ -1,19 +1,67 @@
#!/usr/bin/env python2
# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
__license__ = 'GPL v3'
__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
'''
economist.com
'''
try:
from http.cookiejar import Cookie
except ImportError:
from cookielib import Cookie
import json
from collections import OrderedDict
from html5_parser import parse
from lxml import etree
from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.web.feeds.news import BasicNewsRecipe
from calibre.utils.cleantext import clean_ascii_chars
from calibre.web.feeds.news import BasicNewsRecipe
def E(parent, name, text='', **attrs):
    """Create a child element called *name* under *parent*.

    The new element's text is set to *text*, any extra keyword
    arguments become its attributes, and the element is appended to
    *parent* before being returned.
    """
    child = parent.makeelement(name, **attrs)
    child.text = text
    parent.append(child)
    return child
def process_node(node, html_parent):
    """Recursively convert one JSON DOM node into child elements of *html_parent*.

    A ``'tag'`` node becomes a new child element carrying the node's
    attributes, with its children processed recursively.  A ``'text'``
    node is merged into the parent's text (or the tail of the parent's
    last child, if any).  Any other node type is ignored.
    """
    kind = node.get('type')
    if kind == 'tag':
        element = html_parent.makeelement(node['name'])
        element.attrib.update(node.get('attribs', {}))
        html_parent.append(element)
        for child in node.get('children', ()):
            process_node(child, element)
        return
    if kind != 'text':
        return
    data = node.get('data')
    if not data:
        return
    if len(html_parent):
        last = html_parent[-1]
        last.tail = (last.tail or '') + data
    else:
        html_parent.text = (html_parent.text or '') + data
def load_article_from_json(raw, root):
    """Rebuild the <body> of *root* from the page's JSON payload.

    *raw* is the JSON text of the page's ``__NEXT_DATA__`` script.  The
    original body markup is discarded and replaced with a new <article>
    element containing the subheadline, headline, description, date
    line, the main image (when present) and the article text nodes.
    """
    content = json.loads(raw)['props']['pageProps']['content']
    body = root.xpath('//body')[0]
    # Discard the original page markup; everything is rebuilt from JSON.
    for existing in tuple(body):
        body.remove(existing)
    article = E(body, 'article')
    E(article, 'h4', content['subheadline'], style='color: red; margin: 0')
    E(article, 'h1', content['headline'], style='font-size: x-large')
    E(article, 'div', content['description'], style='font-style: italic')
    dateline = (content['datePublishedString'] or '') + ' | ' + (content['dateline'] or '')
    E(article, 'div', dateline, style='color: gray; margin: 1em')
    if 'main' in content['image']:
        img_div = E(article, 'div')
        try:
            E(img_div, 'img', src=content['image']['main']['url']['canonical'])
        except Exception:
            pass  # best effort: the image metadata may be incomplete
    for node in content['text']:
        process_node(node, article)
def classes(classes):
@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
resolve_internal_links = True
remove_tags = [
dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
dict(attrs={'aria-label': "Article Teaser"}),
dict(attrs={
'class': [
'dblClkTrk', 'ec-article-info', 'share_inline_header',
@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
),
dict(attrs={
'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
classes(
'share-links-header teaser--wrapped latest-updates-panel__container'
' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
)
]
keep_only_tags = [dict(name='article', id=lambda x: not x)]
no_stylesheets = True
@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
return br
def preprocess_raw_html(self, raw, url):
import html5lib
root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
from lxml import etree
# open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
root = parse(raw)
script = root.xpath('//script[@id="__NEXT_DATA__"]')
if script:
load_article_from_json(script[0].text, root)
for div in root.xpath('//div[@class="lazy-image"]'):
noscript = list(div.iter('noscript'))
if noscript and noscript[0].text:
img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
img = list(parse(noscript[0].text).iter('img'))
if img:
p = noscript[0].getparent()
idx = p.index(noscript[0])
@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
def parse_index(self):
# return [('Articles', [{'title':'test',
# 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
# 'url':'file:///t/raw.html'
# }])]
raw = self.index_to_soup(self.INDEX, raw=True)
# with open('/t/raw.html', 'wb') as f: