New recipe for The Daily Mail UK by RufusA. Fix #998 (HTML2LRF and empty headings)

This commit is contained in:
Kovid Goyal 2009-03-10 10:40:56 -07:00
parent aedb2cf959
commit 8ea72440c8
3 changed files with 38 additions and 1 deletions

View File

@ -99,6 +99,10 @@ class HTMLConverter(object, LoggingInterface):
# Replace common line break patterns with line breaks # Replace common line break patterns with line breaks
(re.compile(r'<p>(&nbsp;|\s)*</p>', re.IGNORECASE), lambda m: '<br />'), (re.compile(r'<p>(&nbsp;|\s)*</p>', re.IGNORECASE), lambda m: '<br />'),
# Replace empty headers with line breaks
(re.compile(r'<h[0-5]?>(&nbsp;|\s)*</h[0-5]?>',
re.IGNORECASE), lambda m: '<br />'),
# Replace entities # Replace entities
(re.compile(ur'&(\S+?);'), partial(entity_to_unicode, (re.compile(ur'&(\S+?);'), partial(entity_to_unicode,
exceptions=['lt', 'gt', 'amp'])), exceptions=['lt', 'gt', 'amp'])),

View File

@ -33,7 +33,7 @@ recipe_modules = ['recipe_' + r for r in (
'la_republica', 'physics_today', 'chicago_tribune', 'e_novine', 'la_republica', 'physics_today', 'chicago_tribune', 'e_novine',
'al_jazeera', 'winsupersite', 'borba', 'courrierinternational', 'al_jazeera', 'winsupersite', 'borba', 'courrierinternational',
'lamujerdemivida', 'soldiers', 'theonion', 'news_times', 'lamujerdemivida', 'soldiers', 'theonion', 'news_times',
'el_universal', 'mediapart', 'wikinews_en', 'ecogeek', 'el_universal', 'mediapart', 'wikinews_en', 'ecogeek', 'daily_mail',
)] )]
import re, imp, inspect, time, os import re, imp, inspect, time, os

View File

@ -0,0 +1,33 @@
from calibre.web.feeds.news import BasicNewsRecipe
class TheDailyMail(BasicNewsRecipe):
    """Recipe that downloads The Daily Mail (UK) from its section RSS feeds.

    Articles are fetched through the site's print view (see
    ``print_version``) and then trimmed using the tag filters below.
    """

    # --- Identification ---------------------------------------------------
    title = u'The Daily Mail'
    author = 'RufusA'
    language = _('English')

    # --- Download limits --------------------------------------------------
    oldest_article = 2
    max_articles_per_feed = 50
    # Fetch one item at a time rather than in parallel.
    simultaneous_downloads = 1

    # --- Presentation -----------------------------------------------------
    # Ignore the site's own stylesheets and only left-align the headlines.
    no_stylesheets = True
    extra_css = 'h1 {text-align: left;}'

    # --- Page clean-up ----------------------------------------------------
    # Keep what lies between the main content <div> and the social-links
    # heading; additionally drop the list of article icon links.
    remove_tags_before = {'name': 'div', 'attrs': {'id': 'content'}}
    remove_tags_after = {'name': 'h3', 'attrs': {'class': 'social-links-title'}}
    remove_tags = [
        {'name': 'ul', 'attrs': {'class': 'article-icons-links'}},
    ]

    # --- Feeds: (section title, RSS URL) ----------------------------------
    feeds = [
        (u'Home', u'http://www.dailymail.co.uk/home/index.rss'),
        (u'News', u'http://www.dailymail.co.uk/news/index.rss'),
        (u'Sport', u'http://www.dailymail.co.uk/sport/index.rss'),
        (u'TV and Showbiz', u'http://www.dailymail.co.uk/tvshowbiz/index.rss'),
        (u'Femail', u'http://www.dailymail.co.uk/femail/index.rss'),
        (u'Health', u'http://www.dailymail.co.uk/health/index.rss'),
        (u'Science and Technology',
         u'http://www.dailymail.co.uk/sciencetech/index.rss'),
        (u'Money', u'http://www.dailymail.co.uk/money/index.rss'),
        (u'Property', u'http://www.dailymail.co.uk/property/index.rss'),
        (u'Motoring', u'http://www.dailymail.co.uk/motoring/index.rss'),
        (u'Travel', u'http://www.dailymail.co.uk/travel/index.rss'),
    ]

    def print_version(self, url):
        """Return the print-view URL for the article at *url*.

        Everything from the first ``?`` onwards is discarded and replaced
        with the ``printingPage=true`` query string.
        """
        without_query = url.split('?', 1)[0]
        return without_query + '?printingPage=true'