mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-07-09 03:04:10 -04:00
Merge branch 'master' of https://github.com/unkn0w7n/calibre
This commit is contained in:
commit
06752c03cc
@ -1,6 +1,7 @@
|
|||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
from urllib.parse import quote
|
from urllib.parse import quote
|
||||||
|
from html5_parser import parse
|
||||||
|
|
||||||
from calibre.web.feeds.news import BasicNewsRecipe
|
from calibre.web.feeds.news import BasicNewsRecipe
|
||||||
|
|
||||||
@ -16,9 +17,29 @@ class ft(BasicNewsRecipe):
|
|||||||
remove_javascript = True
|
remove_javascript = True
|
||||||
remove_empty_feeds = True
|
remove_empty_feeds = True
|
||||||
ignore_duplicate_articles = {'url'}
|
ignore_duplicate_articles = {'url'}
|
||||||
|
resolve_internal_links = True
|
||||||
remove_attributes = ['style', 'width', 'height']
|
remove_attributes = ['style', 'width', 'height']
|
||||||
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
masthead_url = 'https://im.ft-static.com/m/img/masthead_main.jpg'
|
||||||
extra_css = '#fig-cap{font-style:italic; text-align:left; font-size:small;}'
|
|
||||||
|
extra_css = '''
|
||||||
|
.article-info__time-byline {font-size:small; font-weight:bold; }
|
||||||
|
.o-topper__visual, #fig, .main-image, .n-content-image { text-align:center; font-size:small; }
|
||||||
|
blockquote, i { color:#5c5c5c; }
|
||||||
|
.o-topper__standfirst { font-weight:bold; color:#202020; }
|
||||||
|
.o-topper__topic { font-size:small; color:#5c5c5c; }
|
||||||
|
'''
|
||||||
|
|
||||||
|
keep_only_tags = [
|
||||||
|
classes(
|
||||||
|
'body_json o-topper__topic o-topper__headline o-topper__standfirst o-topper__visual article-info__time-byline main-image'
|
||||||
|
),
|
||||||
|
dict(name='article', attrs={'id':'article-body'})
|
||||||
|
]
|
||||||
|
|
||||||
|
remove_tags = [
|
||||||
|
dict(name='aside', attrs={'class':'n-content-recommended--single-story'}),
|
||||||
|
classes('in-article-advert')
|
||||||
|
]
|
||||||
|
|
||||||
# needs_subscription = 'optional'
|
# needs_subscription = 'optional'
|
||||||
#
|
#
|
||||||
@ -40,6 +61,10 @@ class ft(BasicNewsRecipe):
|
|||||||
br.set_current_header('Referer', 'https://www.google.com/')
|
br.set_current_header('Referer', 'https://www.google.com/')
|
||||||
return br
|
return br
|
||||||
|
|
||||||
|
# the print_version loads all articles but sometimes it might fail due to too many requests
|
||||||
|
# def print_version(self, url):
|
||||||
|
# return 'https://webcache.googleusercontent.com/search?q=cache:' + quote(url, safe='')
|
||||||
|
|
||||||
def get_cover_url(self):
|
def get_cover_url(self):
|
||||||
from datetime import date
|
from datetime import date
|
||||||
cover = 'http://img.kiosko.net/' + str(
|
cover = 'http://img.kiosko.net/' + str(
|
||||||
@ -74,6 +99,11 @@ class ft(BasicNewsRecipe):
|
|||||||
def preprocess_raw_html(self, raw, *a):
|
def preprocess_raw_html(self, raw, *a):
|
||||||
# with open('/t/raw.html', 'w') as f:
|
# with open('/t/raw.html', 'w') as f:
|
||||||
# f.write(raw)
|
# f.write(raw)
|
||||||
|
root = parse(raw)
|
||||||
|
if x := root.xpath('//article[@id="article-body"]'):
|
||||||
|
self.log('**has article content')
|
||||||
|
return raw
|
||||||
|
self.log('**no article content')
|
||||||
m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
|
m = re.search(r'type="application/ld\+json">[^<]+?"@type":"NewsArticle"', raw)
|
||||||
raw = raw[m.start():]
|
raw = raw[m.start():]
|
||||||
raw = raw.split('>', 1)[1]
|
raw = raw.split('>', 1)[1]
|
||||||
@ -114,12 +144,11 @@ class ft(BasicNewsRecipe):
|
|||||||
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
body = re.sub(r'\[https://\S+?\]', insert_image, body)
|
||||||
if data.get('description'):
|
if data.get('description'):
|
||||||
desc = '<h2>' + data['description'] + '</h2>'
|
desc = '<h2>' + data['description'] + '</h2>'
|
||||||
html = '<html><body><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
html = '<html><body><div class="body_json"><h1>' + title + '</h1>' + desc + '<h3>' + author + '</h3>' + image + '<p>' + body
|
||||||
return html
|
return html
|
||||||
|
|
||||||
def preprocess_html(self, soup):
|
def preprocess_html(self, soup):
|
||||||
for span in soup.findAll('span'):
|
for con in soup.findAll(attrs={'class':'n-content-layout__slot'}):
|
||||||
p = span.findParent('p')
|
if con.find('figure'):
|
||||||
if p:
|
con['id'] = 'fig'
|
||||||
p['id'] = 'fig-cap'
|
|
||||||
return soup
|
return soup
|
||||||
|
Loading…
x
Reference in New Issue
Block a user