This commit is contained in:
Kovid Goyal 2024-09-18 08:27:58 +05:30
parent c7f0e65fa7
commit 7597538345
No known key found for this signature in database
GPG Key ID: 06BC317B515ACE7C
3 changed files with 9 additions and 8 deletions

View File

@ -6,12 +6,13 @@ https://www.economist.com/the-world-in-brief
import json import json
from urllib.parse import quote, urlencode from urllib.parse import quote, urlencode
from html5_parser import parse
from lxml import etree
from calibre import replace_entities from calibre import replace_entities
from calibre.ebooks.BeautifulSoup import NavigableString, Tag from calibre.ebooks.BeautifulSoup import NavigableString, Tag
from calibre.ptempfile import PersistentTemporaryFile from calibre.ptempfile import PersistentTemporaryFile
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
from html5_parser import parse
from lxml import etree
def E(parent, name, text='', **attrs): def E(parent, name, text='', **attrs):

View File

@ -8,6 +8,7 @@ engadget.com
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
def classes(classes): def classes(classes):
q = frozenset(classes.split(' ')) q = frozenset(classes.split(' '))
return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)}) return dict(attrs={'class': lambda x: x and frozenset(x.split()).intersection(q)})
@ -30,14 +31,14 @@ class Engadget(BasicNewsRecipe):
scale_news_images_to_device = True scale_news_images_to_device = True
cover_url = 'https://upload.wikimedia.org/wikipedia/commons/b/bb/Engadget-logo.svg' cover_url = 'https://upload.wikimedia.org/wikipedia/commons/b/bb/Engadget-logo.svg'
keep_only_tags = [ keep_only_tags = [
classes('caas-content-wrapper caas-title-wrapper'), classes('caas-content-wrapper caas-title-wrapper'),
dict(name='figure') dict(name='figure')
] ]
remove_tags = [ remove_tags = [
dict(name='div', attrs={'class':'caas-content-byline-wrapper'}), dict(name='div', attrs={'class':'caas-content-byline-wrapper'}),
dict(name='div', attrs={'data-component':'ArticleAuthorInfo'}), dict(name='div', attrs={'data-component':'ArticleAuthorInfo'}),
classes('commerce-module caas-header caas-prestige-bottom-share caas-share-buttons caas-da caas-3p-blocked commerce-disclaimer notification-upsell-push article-slideshow athena-button email-form') classes('commerce-module caas-header caas-prestige-bottom-share caas-share-buttons caas-da caas-3p-blocked commerce-disclaimer notification-upsell-push article-slideshow athena-button email-form') # noqa
] ]
feeds = [(u'Posts', u'https://www.engadget.com/rss.xml')] feeds = [(u'Posts', u'https://www.engadget.com/rss.xml')]
@ -86,7 +87,6 @@ class Engadget(BasicNewsRecipe):
except KeyError: except KeyError:
continue continue
# Reorder the "title" and "content" elements # Reorder the "title" and "content" elements
body_tag = soup.find('body')
title_div = soup.find("div", {"class": "caas-title-wrapper"}) title_div = soup.find("div", {"class": "caas-title-wrapper"})
content_div = soup.find("div", {"class": "caas-content-wrapper"}) content_div = soup.find("div", {"class": "caas-content-wrapper"})
if title_div and content_div: if title_div and content_div:
@ -94,4 +94,3 @@ class Engadget(BasicNewsRecipe):
soup.body.append(title_div) soup.body.append(title_div)
soup.body.append(content_div) soup.body.append(content_div)
return soup return soup

View File

@ -1,8 +1,9 @@
#!/usr/bin/env python #!/usr/bin/env python
import re
import json import json
import re
import time import time
from datetime import datetime, timedelta from datetime import datetime, timedelta
from calibre.web.feeds.news import BasicNewsRecipe from calibre.web.feeds.news import BasicNewsRecipe
@ -94,7 +95,7 @@ def parse_types(x):
yield '<p>' + ''.join(parse_cnt(x)) + '</p>' yield '<p>' + ''.join(parse_cnt(x)) + '</p>'
elif x.get('__typename', '') == 'BylineBlock': elif x.get('__typename', '') == 'BylineBlock':
yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>' yield '<div class="byl"><br/>' + ''.join(parse_byline(x)) + '</div>'
elif x.get('__typename', '') == 'LabelBlock': elif x.get('__typename', '') == 'LabelBlock':
yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>' yield '<div class="sc">' + ''.join(parse_cnt(x)) + '</div>'
elif x.get('__typename', '') == 'BlockquoteBlock': elif x.get('__typename', '') == 'BlockquoteBlock':