ga - Handles article titles enclosed in tags (issue 132)

This commit is contained in:
Gary Arnold 2018-03-05 15:51:13 -08:00
parent b32374ca7b
commit 32c88dbe5f

View File

@@ -156,10 +156,14 @@ def absurl(url):
url = 'https://www.granta.com' + url url = 'https://www.granta.com' + url
return url return url
def stripstyle(tag):
    """Delete the ``'style'`` entry from *tag* in place.

    A ``None`` argument is ignored, so callers can pass the result of a
    lookup that may have found nothing. Returns ``None`` in every case.
    """
    # Nothing to strip when the lookup that produced `tag` came back empty.
    if tag is None:
        return
    del tag['style']
def get_innermost_string(tag):
    """Return the text of the deepest first-child node under *tag*.

    Starting at *tag*, repeatedly step into ``contents[0]`` for as long as
    the current node has a ``contents`` attribute that is non-empty and
    whose first element is not ``None``. The node where the descent stops
    is converted with ``str()`` and stripped of surrounding whitespace.

    Only the first child is followed at each level; siblings are ignored.
    """
    node = tag
    while True:
        # Leaf reached: this node cannot hold children at all.
        if not hasattr(node, 'contents'):
            break
        children = node.contents
        # Stop on an empty container or a missing first child.
        if len(children) == 0 or children[0] is None:
            break
        node = children[0]
    return str(node).strip()
################################################################## ##################################################################
@@ -265,16 +269,16 @@ class Granta(BasicNewsRecipe):
h1 = toc.find('h1') h1 = toc.find('h1')
h2 = toc.find('h2') h2 = toc.find('h2')
if h1.find('a') is not None and len(h1.find('a').contents) > 0 and h1.find('a').contents[0] is not None: if h1.find('a') is not None and len(h1.find('a').contents) > 0 and h1.find('a').contents[0] is not None:
title = str(h1.find('a').contents[0]).strip() title = get_innermost_string(h1.find('a').contents[0])
elif len(h1.contents) > 0 and h1.contents[0] is not None: elif len(h1.contents) > 0 and h1.contents[0] is not None:
title = h1.contents[0] title = get_innermost_string(h1.contents[0])
else: else:
title = '' title = ''
if h2.find('a') is not None and len(h2.find('a').contents) > 0 and h2.find('a').contents[0] is not None: if h2.find('a') is not None and len(h2.find('a').contents) > 0 and h2.find('a').contents[0] is not None:
author = str(h2.find('a').contents[0]).strip() author = get_innermost_string(h2.find('a').contents[0])
title = title + u' (%s)' % author title = title + u' (%s)' % author
elif len(h2.contents) > 0 and h2.contents[0] is not None: elif len(h2.contents) > 0 and h2.contents[0] is not None:
author = h2.contents[0] author = get_innermost_string(h2.contents[0])
title = title + u' (%s)' % author title = title + u' (%s)' % author
else: else:
author = '' author = ''