Updated Sun, Mirror, Metro

2025-07-09 03:04:10 -04:00 · 2012-04-12 14:29:31 +05:30 · 2012-04-12 14:29:31 +05:30 · b24bde150d
commit b24bde150d
parent 50e419efd2
3 changed files with 92 additions and 57 deletions
--- a/recipes/daily_mirror.recipe
+++ b/recipes/daily_mirror.recipe
@ -1,20 +1,21 @@
+
 from calibre.web.feeds.news import BasicNewsRecipe
 import re
+import mechanize
 class AdvancedUserRecipe1306061239(BasicNewsRecipe):
    title          = u'The Daily Mirror'
    description = 'News as provide by The Daily Mirror -UK'

    __author__ = 'Dave Asbury'
-    # last updated 11/2/12
+    # last updated 7/4/12
    language = 'en_GB'
-
-    cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+    #cover_url = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'

    masthead_url = 'http://www.nmauk.co.uk/nma/images/daily_mirror.gif'


    oldest_article = 1
-    max_articles_per_feed = 5
+    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript     = True
    no_stylesheets = True
@ -75,3 +76,28 @@ class AdvancedUserRecipe1306061239(BasicNewsRecipe):
                    img { display:block}
                	 '''

+    def get_cover_url(self):
+        """Return the latest Mirror front-page image URL for use as the cover.
+
+        Falls back to a fixed image URL when the politicshome.com listing
+        entry or the image element is missing, or the image cannot be opened.
+        """
+        fallback = 'http://yookeo.com/screens/m/i/mirror.co.uk.jpg'
+        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
+        # look for the block containing the mirror button and url
+        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_92.gif);'})
+        if cov is None:
+            return fallback  # str(None) would be sliced into a junk URL
+        # fixed offsets cut the page path out of the tag markup (TODO confirm layout)
+        cov2 = 'http://www.politicshome.com' + str(cov)[9:-142]
+        cov = self.index_to_soup(cov2).find(attrs={'id' : 'large'})
+        if cov is None:
+            return fallback
+        cov2 = str(cov)[27:-18]  # pic url, again cut out by fixed offsets
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
+        try:
+            br.open_novisit(cov2)  # probe only; redirects are not followed
+        except Exception:  # was a bare except, which also trapped KeyboardInterrupt
+            return fallback
+        return cov2
--- a/recipes/metro_uk.recipe
+++ b/recipes/metro_uk.recipe
@ -1,52 +1,30 @@
-import re
 from calibre.web.feeds.news import BasicNewsRecipe
 class AdvancedUserRecipe1306097511(BasicNewsRecipe):
    title          = u'Metro UK'
    description = 'News as provide by The Metro -UK'
-
+    #timefmt = ''
    __author__ = 'Dave Asbury'
-    #last update 3/12/11
    cover_url = 'http://profile.ak.fbcdn.net/hprofile-ak-snc4/276636_117118184990145_2132092232_n.jpg'
-    no_stylesheets = True
+    #no_stylesheets = True
    oldest_article = 1
-    max_articles_per_feed = 20
+    max_articles_per_feed = 10
    remove_empty_feeds = True
    remove_javascript     = True
+    auto_cleanup = True

-    #preprocess_regexps = [(re.compile(r'Tweet'), lambda  a : '')]
-    preprocess_regexps = [
-    (re.compile(r'<span class="img-cap legend">', re.IGNORECASE | re.DOTALL), lambda match: '<p></p><span class="img-cap legend"> ')]
-    preprocess_regexps = [
-    (re.compile(r'tweet', re.IGNORECASE | re.DOTALL), lambda match: '')]

    language = 'en_GB'
-
-
    masthead_url        = 'http://e-edition.metro.co.uk/images/metro_logo.gif'
-
-
    keep_only_tags = [
-    dict(name='h1'),dict(name='h2', attrs={'class':'h2'}),
-                    dict(attrs={'class':['img-cnt figure']}),
-        dict(attrs={'class':['art-img']}),
-                    dict(name='div', attrs={'class':'art-lft'}),
-                    dict(name='p')
+
    ]
    remove_tags    = [
-                             dict(name = 'div',attrs={'id' : ['comments-news','formSubmission']}),
-                             dict(name='div', attrs={'class':[ 'news m12 clrd clr-b p5t shareBtm', 'commentForm', 'metroCommentInnerWrap',
-                             'art-rgt','pluck-app pluck-comm','news m12 clrd clr-l p5t', 'flt-r','username','clrd' ]}),
-              dict(attrs={'class':['username', 'metroCommentFormWrap','commentText','commentsNav','avatar','submDateAndTime','addYourComment','displayName']})
-                              ,dict(name='div', attrs={'class' : 'clrd art-fd fd-gr1-b'})
+
                               ]
+
+
    feeds          = [
        (u'News', u'http://www.metro.co.uk/rss/news/'), (u'Money', u'http://www.metro.co.uk/rss/money/'), (u'Sport', u'http://www.metro.co.uk/rss/sport/'), (u'Film', u'http://www.metro.co.uk/rss/metrolife/film/'), (u'Music', u'http://www.metro.co.uk/rss/metrolife/music/'), (u'TV', u'http://www.metro.co.uk/rss/tv/'), (u'Showbiz', u'http://www.metro.co.uk/rss/showbiz/'), (u'Weird News', u'http://www.metro.co.uk/rss/weird/'), (u'Travel', u'http://www.metro.co.uk/rss/travel/'), (u'Lifestyle', u'http://www.metro.co.uk/rss/lifestyle/'), (u'Books', u'http://www.metro.co.uk/rss/lifestyle/books/'), (u'Food', u'http://www.metro.co.uk/rss/lifestyle/restaurants/')]
-
    extra_css  = '''
-                    body {font: sans-serif medium;}'
-    h1 {text-align : center; font-family:Arial,Helvetica,sans-serif; font-size:20px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold;}
-                h2 {text-align : center;color:#4D4D4D;font-family:Arial,Helvetica,sans-serif; font-size:15px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:bold; }
-                    span{ font-size:9.5px; font-weight:bold;font-style:italic}
-                    p { text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
-
+	body{ text-align: justify; font-family:Arial,Helvetica,sans-serif; font-size:11px; font-size-adjust:none; font-stretch:normal; font-style:normal; font-variant:normal; font-weight:normal;}
                	 '''
--- a/recipes/the_sun.recipe
+++ b/recipes/the_sun.recipe
@ -1,9 +1,8 @@
-import re
+import re, mechanize
 from calibre.web.feeds.recipes import BasicNewsRecipe
 class AdvancedUserRecipe1325006965(BasicNewsRecipe):

    title          = u'The Sun UK'
-    cover_url = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'

    description = 'A Recipe for The Sun tabloid UK'
    __author__ = 'Dave Asbury'
@ -49,7 +48,6 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):


    feeds          = [
-	#(u'News', u'http://www.thesun.co.uk/sol/homepage/news/rss'),
            (u'News','http://feed43.com/2517447382644748.xml'),
            (u'Sport', u'http://feed43.com/4283846255668687.xml'),
            (u'Bizarre', u'http://feed43.com/0233840304242011.xml'),
@ -58,3 +56,36 @@ class AdvancedUserRecipe1325006965(BasicNewsRecipe):
            (u'Sun Woman',u'http://feed43.com/0022626854226453.xml'),
    ]

+    def get_cover_url(self):
+        """Return the latest Sun front-page image URL for use as the cover.
+
+        Falls back to the Sun masthead logo URL when the politicshome.com
+        listing entry or the image element is missing, or the image cannot
+        be opened.
+        """
+        fallback = 'http://www.thesun.co.uk/img/global/new-masthead-logo.png'
+        soup = self.index_to_soup('http://www.politicshome.com/uk/latest_frontpage.html')
+        # look for the block containing the sun button and url
+        cov = soup.find(attrs={'style' : 'background-image: url(http://www.politicshome.com/images/sources/source_frontpage_button_84.gif);'})
+        if cov is None:
+            # str(None) would be sliced into a junk URL below
+            return fallback
+
+        # fixed offsets cut the page path out of the tag markup
+        # (TODO confirm they still match the site's layout)
+        cov2 = 'http://www.politicshome.com' + str(cov)[9:-133]
+        cov = self.index_to_soup(cov2).find(attrs={'id' : 'large'})
+        if cov is None:
+            return fallback
+        # pic url, again cut out by fixed offsets
+        cov2 = str(cov)[27:-18]
+
+        br = mechanize.Browser()
+        br.set_handle_redirect(False)
+        try:
+            # probe only; redirects are not followed
+            br.open_novisit(cov2)
+        except Exception:
+            # was a bare except, which also trapped KeyboardInterrupt
+            return fallback
+        return cov2