mirror of
https://github.com/kovidgoyal/calibre.git
synced 2025-06-23 15:30:45 -04:00
72 lines
2.5 KiB
Python
72 lines
2.5 KiB
Python
#!/usr/bin/env python2
|
|
# encoding: utf-8
|
|
|
|
from __future__ import with_statement
|
|
__license__ = 'GPL 3'
|
|
__copyright__ = 'zotzo'
|
|
__docformat__ = 'restructuredtext en'
|
|
|
|
from calibre.web.feeds.news import BasicNewsRecipe
|
|
|
|
|
|
def classes(classes):
    """Build ``find``/``findAll`` keyword arguments matching CSS class names.

    :param classes: Whitespace-separated class names to look for.
    :return: A dict of the form ``{'attrs': {'class': matcher}}``.
        ``matcher(value)`` returns ``value`` unchanged when it is falsy
        (``None`` or ``''``); otherwise it returns the frozenset of names
        present in both ``value`` and ``classes``, which is empty (falsy)
        when nothing matches.
    """
    # Tokenise with split() rather than split(' ') so the wanted names are
    # parsed exactly like the element's class attribute below: any run of
    # whitespace (spaces, tabs, newlines) separates names, and no empty
    # tokens are produced.
    wanted = frozenset(classes.split())

    def matches(value):
        return value and frozenset(value.split()).intersection(wanted)

    return dict(attrs={'class': matches})
|
|
|
|
|
|
class NYTimesTechnology(BasicNewsRecipe):
    """Recipe for the New York Times "Gadgetwise" technology blog feed.

    The class attributes configure the feed to download and which parts of
    each article page to keep or strip; ``preprocess_html`` then applies a
    few fix-ups to the parsed article.
    """

    # Recipe metadata.
    title = 'New York Times Technology Beat'
    language = 'en'
    description = 'The latest in technology - Gadgetwise'
    publisher = 'The New York Times'
    category = 'Technology'

    # Download limits and output options.
    oldest_article = 14
    max_articles_per_feed = 25
    remove_empty_feeds = True
    no_stylesheets = True

    # (feed title, feed URL) pairs.
    feeds = [
        (u'Gadgetwise', u'http://gadgetwise.blogs.nytimes.com/feed/'),
    ]

    # Only the element with id="story" is kept.
    keep_only_tags = [
        dict(id='story'),
    ]

    # Matchers for page furniture (toolbars, ads, share tools, promos,
    # related-coverage blocks, ...) to strip from the article.
    remove_tags = [
        dict(attrs={'aria-label': 'tools'.split()}),
        dict(attrs={'aria-label': lambda x: x and 'New York Times Logo' in x}),
        dict(href='#site-content #site-index'.split()),
        dict(attrs={'aria-hidden': 'true'}),
        dict(attrs={'data-videoid': True}),
        dict(name='button meta link'.split()),
        dict(id=lambda x: x and x.startswith('story-ad-')),
        dict(name='head'),
        dict(role='toolbar'),
        dict(name='a', href=lambda x: x and '#story-continues-' in x),
        dict(name='a', href=lambda x: x and '#whats-next' in x),
        dict(id=lambda x: x and 'sharetools-' in x),
        dict(id='newsletter-promo supported-by-ad bottom-wrapper'.split()),
        classes('story-print-citation supported-by accessibility-ad-header visually-hidden bottom-of-article ad'),
        # Substring matches on the class attribute.
        dict(attrs={'class': lambda x: x and (
            'SectionBar' in x
            or 'recirculation' in x
            or 'ResponsiveAd' in x
            or 'accessibility-visuallyHidden' in x
            or 'RelatedCoverage' in x)}),
    ]

    def preprocess_html(self, soup):
        """Tidy the parsed article and return the same ``soup`` object.

        Three in-place fix-ups are applied:

        * a space is inserted at the start of the dateline element;
        * ``<li>`` elements whose class starts with ``css-`` and that have
          no content are removed;
        * the ``<h1 itemprop="headline">`` element is moved to be the first
          child of ``<body>``.

        :param soup: The parsed article document.
        :return: ``soup``, modified in place.
        """
        # Add a space to the dateline
        dateline = soup.find(**classes('dateline'))
        if dateline is not None:
            dateline.insert(0, ' ')

        # Remove empty li tags
        for li in soup.findAll('li', attrs={'class': lambda x: x and x.startswith('css-')}):
            if not li.contents and not li.string:
                li.extract()

        # Ensure the headline is first. Look up both elements before
        # detaching anything so the headline is never dropped when the
        # document has no <body>, and use Tag.insert() so the tree's
        # parent/sibling links are maintained.
        h1 = soup.find('h1', itemprop='headline')
        body = soup.find('body')
        if h1 is not None and body is not None:
            h1.extract()
            body.insert(0, h1)
        return soup
|