From 45d114bc322f83b7e7e8a21f1cc298c052218d33 Mon Sep 17 00:00:00 2001 From: Kovid Goyal Date: Thu, 14 Feb 2008 05:13:03 +0000 Subject: [PATCH] Add profiles for Washington Post and United Press International --- src/libprs500/ebooks/lrf/web/__init__.py | 6 ++- src/libprs500/ebooks/lrf/web/profiles/upi.py | 36 +++++++++++++++ .../ebooks/lrf/web/profiles/wash_post.py | 44 +++++++++++++++++++ 3 files changed, 84 insertions(+), 2 deletions(-) create mode 100644 src/libprs500/ebooks/lrf/web/profiles/upi.py create mode 100644 src/libprs500/ebooks/lrf/web/profiles/wash_post.py diff --git a/src/libprs500/ebooks/lrf/web/__init__.py b/src/libprs500/ebooks/lrf/web/__init__.py index 7b38f51867..0481618836 100644 --- a/src/libprs500/ebooks/lrf/web/__init__.py +++ b/src/libprs500/ebooks/lrf/web/__init__.py @@ -34,13 +34,15 @@ from libprs500.ebooks.lrf.web.profiles.ap import AssociatedPress from libprs500.ebooks.lrf.web.profiles.newyorker import NewYorker from libprs500.ebooks.lrf.web.profiles.jutarnji import Jutarnji from libprs500.ebooks.lrf.web.profiles.usatoday import USAToday +from libprs500.ebooks.lrf.web.profiles.upi import UnitedPressInternational +from libprs500.ebooks.lrf.web.profiles.wash_post import WashingtonPost builtin_profiles = [Atlantic, AssociatedPress, Barrons, BBC, ChristianScienceMonitor, CNN, Dilbert, Economist, FazNet, JerusalemPost, Jutarnji, Newsweek, NewYorker, - NewYorkReviewOfBooks, NYTimes, USAToday, + NewYorkReviewOfBooks, NYTimes, UnitedPressInternational, USAToday, Portfolio, Reuters, SpiegelOnline, WallStreetJournal, - ZeitNachrichten, + WashingtonPost, ZeitNachrichten, ] available_profiles = [i.__module__.rpartition('.')[2] for i in builtin_profiles] \ No newline at end of file diff --git a/src/libprs500/ebooks/lrf/web/profiles/upi.py b/src/libprs500/ebooks/lrf/web/profiles/upi.py new file mode 100644 index 0000000000..ab9f0acbc4 --- /dev/null +++ b/src/libprs500/ebooks/lrf/web/profiles/upi.py @@ -0,0 +1,36 @@ +import re +from libprs500.ebooks.lrf.web.profiles import DefaultProfile + + +class UnitedPressInternational(DefaultProfile): + + title = 'United Press International' + max_recursions = 2 + max_articles_per_feed = 15 + html2lrf_options = ['--override-css= "H1 {font-family: Arial; font-weight: bold; color: #000000; size: 10pt;}"'] + + + preprocess_regexps = [ (re.compile(i[0], re.IGNORECASE | re.DOTALL), i[1]) for i in + [ + (r'.*?' , lambda match : ''), + (r'
.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + (r'.*?', lambda match : ''), + ##(r'.*?
', lambda match : ''), + (r'', lambda match : ''), + (r'.*?.correction {', lambda match : '