Update Bloomberg Businessweek

This commit is contained in:
unkn0w7n 2024-06-29 12:15:18 +05:30
parent d52da5b931
commit 59f697c0d5
2 changed files with 25 additions and 24 deletions

View File

@@ -2,8 +2,9 @@ import json
import random import random
import time import time
from calibre.web.feeds.news import BasicNewsRecipe, classes from calibre.web.feeds.news import BasicNewsRecipe, classes, prefixed_classes
from html5_parser import parse from html5_parser import parse
from collections import defaultdict
def get_contents(x): def get_contents(x):
@@ -106,30 +107,29 @@ class Bloomberg(BasicNewsRecipe):
self.log('Downloading ', edition) self.log('Downloading ', edition)
self.cover_url = bw.find('img')['src'].replace('25x19', '600x800') self.cover_url = bw.find('img')['src'].replace('25x19', '600x800')
soup = self.index_to_soup(edition) soup = self.index_to_soup(edition)
if timefmt := soup.find(attrs={'class':lambda x: x and x.startswith('styles_MagazineTitle__')}): if timefmt := soup.find(**prefixed_classes('styles_TableOfContentsTitle__')):
self.timefmt = ' [' + (self.tag_to_string(timefmt).replace(' Issue', '')).strip() + ']' self.timefmt = ' [' + (self.tag_to_string(timefmt).replace(' Issue', '')).strip() + ']'
feeds = [] feeds_dict = defaultdict(list)
for div in soup.findAll(attrs={'class':lambda x: x and x.startswith(
('styles_MagazineFeatures__', 'styles_MagazineStoryList__') sec = ''
)}): toc = soup.find('section', attrs={'id':'toc-archive-businessweek'})
h3 = div.find(attrs={'class':lambda x: x and x.startswith( for div in toc.findAll(**prefixed_classes('MagazinePageMagazineArchive_itemContainer__')):
('styles_featuresTitle__', 'styles_magazineSectionTitle__') h3 = div.find(**prefixed_classes('MagazinePageMagazineArchive_itemSection__'))
)}) if h3 and h3.text:
sec = self.tag_to_string(h3) sec = self.tag_to_string(h3)
self.log(sec) self.log(sec)
articles = [] articles = []
for art in div.findAll(attrs={'data-component':'headline'}): a = div.find(**prefixed_classes('MagazinePageMagazineArchive_storyLink__'))
a = art.find('a', href=True)
url = a['href'] url = a['href']
if url.startswith('http') is False: if url.startswith('http') is False:
url = 'https://www.bloomberg.com' + a['href'] url = 'https://www.bloomberg.com' + a['href']
title = self.tag_to_string(a) title = self.tag_to_string(a)
articles.append({'title': title, 'url': url}) byl = div.find(**prefixed_classes('Byline_phoenix__'))
self.log('\t', title, '\n\t\t', url) desc = self.tag_to_string(byl)
if articles: self.log('\t', title, '\n\t', desc, '\n\t\t', url)
feeds.append((sec, articles)) feeds_dict[sec].append({"title": title, "url": url, "description": desc})
return feeds return [(sec, articles) for sec, articles in feeds_dict.items()]
def preprocess_raw_html(self, raw, *a): def preprocess_raw_html(self, raw, *a):
root = parse(raw) root = parse(raw)

View File

@@ -8,7 +8,7 @@ from calibre.web.feeds.news import BasicNewsRecipe
def re_html(y): def re_html(y):
if y: if y:
soup = BeautifulSoup(y.rstrip(), "html.parser") soup = BeautifulSoup(y.rstrip())
return soup.text return soup.text
def get_cont(x): def get_cont(x):
@@ -56,6 +56,7 @@ class tls(BasicNewsRecipe):
data = json.loads(raw) data = json.loads(raw)
self.cover_url = data['featuredimage']['full_image'] + '?w600' self.cover_url = data['featuredimage']['full_image'] + '?w600'
self.timefmt = ' [' + data['issuedateline']['issuedate'] + ']' self.timefmt = ' [' + data['issuedateline']['issuedate'] + ']'
if data['issuedateline']['issuenumber']:
self.description = 'Issue ' + data['issuedateline']['issuenumber'] self.description = 'Issue ' + data['issuedateline']['issuenumber']
feeds = [] feeds = []