mirror of https://github.com/kovidgoyal/calibre.git
synced 2025-07-08 18:54:09 -04:00

commit 15e247f88a
Merge branch 'master' of github.com:kovidgoyal/calibre
@@ -1,19 +1,67 @@
 #!/usr/bin/env python2
+# License: GPLv3 Copyright: 2008, Kovid Goyal <kovid at kovidgoyal.net>
 
-__license__ = 'GPL v3'
-__copyright__ = '2008, Kovid Goyal <kovid at kovidgoyal.net>'
-'''
-economist.com
-'''
 try:
     from http.cookiejar import Cookie
 except ImportError:
     from cookielib import Cookie
+import json
 from collections import OrderedDict
 
+from html5_parser import parse
+from lxml import etree
+
 from calibre.ebooks.BeautifulSoup import NavigableString, Tag
-from calibre.web.feeds.news import BasicNewsRecipe
 from calibre.utils.cleantext import clean_ascii_chars
+from calibre.web.feeds.news import BasicNewsRecipe
 
 
+def E(parent, name, text='', **attrs):
+    ans = parent.makeelement(name, **attrs)
+    ans.text = text
+    parent.append(ans)
+    return ans
+
+
+def process_node(node, html_parent):
+    ntype = node.get('type')
+    if ntype == 'tag':
+        c = html_parent.makeelement(node['name'])
+        c.attrib.update(node.get('attribs', {}))
+        html_parent.append(c)
+        for nc in node.get('children', ()):
+            process_node(nc, c)
+    elif ntype == 'text':
+        text = node.get('data')
+        if text:
+            if len(html_parent):
+                t = html_parent[-1]
+                t.tail = (t.tail or '') + text
+            else:
+                html_parent.text = (html_parent.text or '') + text
+
+
+def load_article_from_json(raw, root):
+    data = json.loads(raw)['props']['pageProps']['content']
+    # open('/t/raw.json', 'w').write(json.dumps(data, indent=2, sort_keys=True))
+    body = root.xpath('//body')[0]
+    for child in tuple(body):
+        body.remove(child)
+    article = E(body, 'article')
+    E(article, 'h4', data['subheadline'], style='color: red; margin: 0')
+    E(article, 'h1', data['headline'], style='font-size: x-large')
+    E(article, 'div', data['description'], style='font-style: italic')
+    E(article, 'div', (data['datePublishedString'] or '') + ' | ' + (data['dateline'] or ''), style='color: gray; margin: 1em')
+    images = data['image']
+    if 'main' in images:
+        div = E(article, 'div')
+        try:
+            E(div, 'img', src=images['main']['url']['canonical'])
+        except Exception:
+            pass
+    text = data['text']
+    for node in text:
+        process_node(node, article)
+
+
 def classes(classes):
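Note: the new process_node() walks the Economist's JSON serialization of the article DOM, where each node is a dict with 'type', 'name', 'attribs', 'children' and 'data' keys. A minimal sketch of how it rebuilds markup; the sample node below is invented for illustration, not taken from a real article payload:

    from lxml import etree

    body = etree.fromstring('<body/>')
    sample = {
        'type': 'tag', 'name': 'p', 'attribs': {'class': 'lead'},
        'children': [{'type': 'text', 'data': 'Hello, world'}],
    }
    process_node(sample, body)
    print(etree.tostring(body))  # b'<body><p class="lead">Hello, world</p></body>'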
@@ -79,6 +127,7 @@ class Economist(BasicNewsRecipe):
     resolve_internal_links = True
     remove_tags = [
         dict(name=['script', 'noscript', 'title', 'iframe', 'cf_floatingcontent', 'aside', 'footer']),
+        dict(attrs={'aria-label': "Article Teaser"}),
         dict(attrs={
             'class': [
                 'dblClkTrk', 'ec-article-info', 'share_inline_header',
@@ -92,7 +141,10 @@ class Economist(BasicNewsRecipe):
         ),
         dict(attrs={
             'class': lambda x: x and 'blog-post__siblings-list-aside' in x.split()}),
-        classes('share-links-header teaser--wrapped latest-updates-panel__container latest-updates-panel__article-link blog-post__section newsletter-form')
+        classes(
+            'share-links-header teaser--wrapped latest-updates-panel__container'
+            ' latest-updates-panel__article-link blog-post__section newsletter-form blog-post__bottom-panel'
+        )
     ]
     keep_only_tags = [dict(name='article', id=lambda x: not x)]
     no_stylesheets = True
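Note: the reflowed classes(...) call relies on Python's implicit concatenation of adjacent string literals, so the two fragments still form one space-separated class list:

    s = ('share-links-header teaser--wrapped'
         ' newsletter-form')
    assert s == 'share-links-header teaser--wrapped newsletter-form'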
@@ -140,13 +192,15 @@ class Economist(BasicNewsRecipe):
         return br
 
     def preprocess_raw_html(self, raw, url):
-        import html5lib
-        root = html5lib.parse(raw, namespaceHTMLElements=False, treebuilder='lxml')
-        from lxml import etree
+        # open('/t/raw.html', 'wb').write(raw.encode('utf-8'))
+        root = parse(raw)
+        script = root.xpath('//script[@id="__NEXT_DATA__"]')
+        if script:
+            load_article_from_json(script[0].text, root)
         for div in root.xpath('//div[@class="lazy-image"]'):
             noscript = list(div.iter('noscript'))
             if noscript and noscript[0].text:
-                img = list(html5lib.parse(noscript[0].text, namespaceHTMLElements=False, treebuilder='lxml').iter('img'))
+                img = list(parse(noscript[0].text).iter('img'))
                 if img:
                     p = noscript[0].getparent()
                     idx = p.index(noscript[0])
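Note: html5_parser.parse() returns an lxml root element directly, replacing the html5lib treebuilder incantation, and the rewritten preprocess_raw_html reads article data out of the Next.js __NEXT_DATA__ script tag when the page carries one. A rough sketch of that extraction step, using an invented page snippet in place of a real Economist page:

    from html5_parser import parse

    raw = '<html><body><script id="__NEXT_DATA__">{"props": {}}</script></body></html>'
    root = parse(raw)
    script = root.xpath('//script[@id="__NEXT_DATA__"]')
    if script:
        print(script[0].text)  # the JSON payload handed to load_article_from_json()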
@@ -171,7 +225,7 @@ class Economist(BasicNewsRecipe):
 
     def parse_index(self):
         # return [('Articles', [{'title':'test',
-        # 'url':'http://www.economist.com/news/business/21718916-worlds-biggest-software-firm-has-transformed-its-culture-better-getting-cloud'
+        # 'url':'file:///t/raw.html'
         # }])]
         raw = self.index_to_soup(self.INDEX, raw=True)
         # with open('/t/raw.html', 'wb') as f:
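For reference, a minimal payload in the shape load_article_from_json() expects; the field names come from the code above, the values are invented:

    import json

    sample = {'props': {'pageProps': {'content': {
        'headline': 'An example headline',
        'subheadline': 'Example',
        'description': 'A one-line description',
        'datePublishedString': 'Jan 1st 2021',
        'dateline': 'LONDON',
        'image': {},
        'text': [{'type': 'text', 'data': 'Body text.'}],
    }}}}
    # load_article_from_json(json.dumps(sample), root) rebuilds the
    # article inside root's <body> from this structure.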