Update The Independent

This commit is contained in:
Kovid Goyal 2012-12-14 20:55:21 +05:30
parent 44636d61e7
commit 358472ff10

View File

@ -47,9 +47,10 @@ class TheIndependentNew(BasicNewsRecipe):
dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}), dict(name='img',attrs={'alt' : ['Get Adobe Flash player']}),
dict(name='img',attrs={'alt' : ['view gallery']}), dict(name='img',attrs={'alt' : ['view gallery']}),
dict(attrs={'style' : re.compile('.*')}), dict(attrs={'style' : re.compile('.*')}),
dict(attrs={'class':lambda x: x and 'voicesRelatedTopics' in x.split()}),
] ]
keep_only_tags =[dict(attrs={'id':'main'})] keep_only_tags =[dict(attrs={'id':['main','top']})]
recursions = 0 recursions = 0
# fixes non compliant html nesting and 'marks' article graphics links # fixes non compliant html nesting and 'marks' article graphics links
@ -69,7 +70,7 @@ class TheIndependentNew(BasicNewsRecipe):
} }
extra_css = """ extra_css = """
h1{font-family: Georgia,serif } h1{font-family: Georgia,serif ; font-size: x-large; }
body{font-family: Verdana,Arial,Helvetica,sans-serif} body{font-family: Verdana,Arial,Helvetica,sans-serif}
img{margin-bottom: 0.4em; display:block} img{margin-bottom: 0.4em; display:block}
.starRating img {float: left} .starRating img {float: left}
@ -77,16 +78,18 @@ class TheIndependentNew(BasicNewsRecipe):
.image {clear:left; font-size: x-small; color:#888888;} .image {clear:left; font-size: x-small; color:#888888;}
.articleByTimeLocation {font-size: x-small; color:#888888; .articleByTimeLocation {font-size: x-small; color:#888888;
margin-bottom:0.2em ; margin-top:0.2em ; display:block} margin-bottom:0.2em ; margin-top:0.2em ; display:block}
.subtitle {clear:left} .subtitle {clear:left ;}
.column-1 h1 { color: #191919} .column-1 h1 { color: #191919}
.column-1 h2 { color: #333333} .column-1 h2 { color: #333333}
.column-1 h3 { color: #444444} .column-1 h3 { color: #444444}
.column-1 p { color: #777777} .subtitle { color: #777777; font-size: medium;}
.column-1 p,a,h1,h2,h3 { margin: 0; } .column-1 a,h1,h2,h3 { margin: 0; }
.column-1 div{color:#888888; margin: 0;} .column-1 div{color:#888888; margin: 0;}
.articleContent {display: block; clear:left;} .articleContent {display: block; clear:left;}
.articleContent p {color: #000000; font-size: medium;}
.storyTop{} .storyTop{}
.pictureContainer img { max-width: 400px; max-height: 400px;} .pictureContainer img { max-width: 400px; max-height: 400px;}
.image img { max-width: 400px; max-height: 400px;}
""" """
oldest_article = 1 oldest_article = 1
@ -325,6 +328,20 @@ class TheIndependentNew(BasicNewsRecipe):
item.contents[0] = '' item.contents[0] = ''
def postprocess_html(self,soup, first_fetch): def postprocess_html(self,soup, first_fetch):
# Mark the subtitle's parent element with the 'subtitle' class, because non-compliant nesting causes
# the p tags to be 'popped out' of the h3 tag they are nested in.
subtitle = soup.find('h3', attrs={'class' : 'subtitle'})
subtitle_div = None
if subtitle:
subtitle_div = subtitle.parent
if subtitle_div:
clazz = ''
if 'class' in subtitle_div:
clazz = subtitle_div['class'] + ' '
clazz = clazz + 'subtitle'
subtitle_div['class'] = clazz
#find broken images and remove captions #find broken images and remove captions
items_to_extract = [] items_to_extract = []
for item in soup.findAll('div', attrs={'class' : 'image'}): for item in soup.findAll('div', attrs={'class' : 'image'}):