diff --git a/recipes/live_law.recipe b/recipes/live_law.recipe index 1e2dc56c8e..58bf6b4943 100644 --- a/recipes/live_law.recipe +++ b/recipes/live_law.recipe @@ -35,6 +35,7 @@ class livelaw(BasicNewsRecipe): ] remove_tags = [ + classes('in-image-ad-wrap'), dict( name='div', attrs={'id': lambda x: x and x.startswith('inside_post_content_ad')} @@ -91,7 +92,7 @@ class livelaw(BasicNewsRecipe): def is_accepted_entry(self, entry): # Those sections in the top nav bar that we will omit omit_list = [ - 'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in' + 'videos', 'job-updates', 'events-corner', 'sponsored', 'hindi.livelaw.in', 'javascript:void(0);', ] is_accepted = True for omit_entry in omit_list: diff --git a/recipes/livemint.recipe b/recipes/livemint.recipe index 26cd09a29e..a112968816 100644 --- a/recipes/livemint.recipe +++ b/recipes/livemint.recipe @@ -13,7 +13,7 @@ class LiveMint(BasicNewsRecipe): title = u'Live Mint' description = 'Financial News from India.' language = 'en_IN' - __author__ = 'Krittika Goyal' + __author__ = 'Krittika Goyal, revised by unkn0wn' oldest_article = 1.15 # days max_articles_per_feed = 50 encoding = 'utf-8' @@ -48,6 +48,11 @@ class LiveMint(BasicNewsRecipe): ('How to Lounge','https://lifestyle.livemint.com/rss/how-to-lounge'), ('Smart Living','https://lifestyle.livemint.com/rss/smart-living'), ] + + def preprocess_html(self, soup): + for img in soup.findAll('img', attrs={'data-img': True}): + img['src'] = img['data-img'] + return soup else: # some wsj articles wont load extra_css = ''' @@ -90,31 +95,29 @@ class LiveMint(BasicNewsRecipe): ('Elections', 'https://www.livemint.com/rss/elections'), ] - def preprocess_raw_html(self, raw, *a): - if '' in raw: - m = re.search(r'type="application/ld\+json">[^<]+?"@type": "NewsArticle"', raw) - raw1 = raw[m.start():] - raw1 = raw1.split('>', 1)[1].strip() - data = json.JSONDecoder().raw_decode(raw1)[0] - value = data['hasPart']['value'] - body = data['articleBody'] + '
' + re.sub(r'([a-z]\.|[0-9]\.)([A-Z])', r'\1
\2', value) - body = '
' + body + '
'\ + + re.sub(r'(([a-z]|[^A-Z])\.|\.”)([A-Z]|“[A-Z])', r'\1
\3', value) + body = '
' + body + '