#############################
# Probability Distributions
# QERM 598: Lab 1
# Eli Gurarie
# January 8, 2008
#############################
# There is a family of functions related to probability distribution functions. They look like: "ddist", "pdist", "qdist", "rdist". We'll see how they work with examples:
# 1) "d" stands for density
dnorm(0) # yields the value of the standard normal (0,1) pdf (which is equal to ....)
dnorm(0,mean=1,sd=2) # change parameter vales
# one way to plot this function
X <- seq(-4,6,.01) # create a vector of x-values
Y1 <- dnorm(X) # apply the function to the x-values
Y2 <- dnorm(X,1,2) # Try different values for the mean and variance
#Note that you don't actually have to specify "mean=" and "sd="
plot(X,Y1,type="l")
lines(X,Y2) # adds a line to a given plot
lines(X,Y2,col=2) #if you prefer
# some basic cosmetic features of "plot"
plot(X,Y1,type="l",xlab="X",ylab="Density",
main="Plots of Normal Distribution density functions")
lines(X,Y2,col=2)
# A shortcut to drawing simple functions
curve(dnorm(x))
curve(dnorm(x,1,2),add=T,col=2) #"add=T" means don't draw a new plot
# In-Class Assignment 1: Look up the "integrate" function (using "?") and confirm that "dnorm" yields a legitimate distribution.
# 2) "p" stands for probability
pnorm(0) # gives the value of the cdf, i.e. the probability that a N(0,1) variable is *less* than 0
pnorm(0,1,2)
curve(pnorm(x))
curve(pnorm(x),xlim=c(-3,6)) # control limits
curve(pnorm(x,1,2),add=T,col=2) # "add=T" means don't redraw plot, "col=2" is red
# Consider a commonly encountered number:
pnorm(-1.96)
pnorm(1.96)
# compare these:
pnorm(1.96,lower.tail=F)
1-pnorm(1.96)
# What's the "p"-value of this number in a different distribution?
pnorm(-1.96,1,2)
pnorm(1.96,1,2)
# Save these values
lo1 <- pnorm(-1.96)
hi1 <- pnorm(1.96)
lo2 <- pnorm(-1.96,1,2)
hi2 <- pnorm(1.96,1,2)
# lets add some points to our distribution plots
points(c(-1.96,1.96),c(lo1,hi1))
points(c(-1.96,1.96),c(lo2,hi2),col=2,pch=2) # pch = point style. See bottom of "?point" for details
# In-Class Assignment 2: Use the "integrate" function to emulate the "pnorm" function.
# 3) "q" stands for quantile
qnorm(0.5) # returns the value for a N(0,1) variable which corresponds to the 0.5 quantile or 50'th percentile
qnorm(0.025)
qnorm(0.975)
#so what's the 95% interval around distribution 2?
qnorm(0.025,1,2)
qnorm(0.975,1,2)
# illustrate the intervals
Interval1<-qnorm(c(0.025,0.975))
Interval2<-qnorm(c(0.025,0.975),1,2)
curve(dnorm(x),xlim=c(-4,6))
curve(dnorm(x,1,2),add=T,col=2)
abline(v=Interval1,lty=2) # "abline" adds straight lines to a plot, usually with intercept (a) and slope (b). "abline(v=..)" is a quick way to add vertical lines. "lty=2" is a dashed line-type.
abline(v=Interval2,lty=2,col=2)
#Let's add horizontal lines to see what the value of the density functions are at these intervals
d1<-dnorm(qnorm(0.025))
d2<-dnorm(qnorm(0.025,1,2),1,2)
abline(h=d1,lty=2)
abline(h=d2,lty=2,col=2)
# This is a little too busy! Split up the plotting window
par(mfrow=c(2,1)) # par is a very very powerful and overloaded function which sets the parameters for subsequent graphics: margins, shadings, labels, colors, axis shapes, and on and on. For now all you need to know is that the parameter "mfrow=c(nrows,ncols)" breaks the plotting window into a certain number of rows and columns.
curve(dnorm(x))
abline(v=Interval1,lty=2)
abline(h=d1,lty=2)
curve(dnorm(x,1,2),col=2)
abline(v=Interval2,col=2,lty=2)
abline(h=d2,col=2,lty=2)
# It is pretty easy to do this with any number of quantiles
Ps <- seq(0,1,.05)
Xs <- qnorm(Ps)
Ds <- dnorm(Xs)
curve(dnorm(x),xlim=c(-3,3),lwd=2)
abline(v=Xs)
abline(h=Ds)
par(mfrow=c(1,1))
# let's get a little bit fancy
curve(dnorm(x),xlim=c(-2.5,2.5),lwd=2,col="grey")
segments(x0=Xs,y0=0,x1=Xs,y1=Ds,col=3)
segments(x0=Xs,y0=Ds,x1=-Xs,y1=Ds,col=3)
# label the p-values with the text command
text(Xs,Ds+.01,Ps,cex=0.8)
# In-class Assignment 3: Adapt the code above making a similar plot for the gamma(2,3) distribution.
# 4) "r" stands for random
# This is the command that allows you to draw samples out of a distribution. It is a one-line piece of magic that allows you to "generate data" - a trick that tends to tick off all real scientists who spend hours and hours and millions of dollars in field sites and labs painstakingly acquiring the real stuff.
# Play with it!
rnorm(1)
rnorm(10)
X<-rnorm(100)
# Collect statistics
mean(X)
sd(X)
# If the "sd" command didn't exist, how would you estimate it "by hand" (in 30 characters or less)? Is "sd" really a good name for this statistic?
# More statistics and manipulations"
max(X)
min(X)
range(X)
sort(X)
order(X)
quantile(X)
# plot the raw data
plot(X)
# plot the sorted data
plot(sort(X))
# plot a quantile-quantile plot
qqnorm(X)
# this is a very important diagnostic plot you will revisit often later
# draw a histogram of the data
hist(X)
# Can we superimpose the known distribution?
curve(dnorm(x),add=T) # doesn't work!
# Two possible solutions. #1:
hist(X,freq=F) # plot density, not frequency
curve(dnorm(x),add=T,col=2)
# Compare it to the "theoretical" distribution
curve(dnorm(x,mean(X),sd(X)),add=T,col=3)
# A little prettier:
hist(X,freq=F,col="lightgrey",border="darkgrey",breaks=30)
curve(dnorm(x),add=T,col=2,lwd=1.5)
curve(dnorm(x,mean(X),sd(X)),add=T,col=3,lwd=1.5)
# maybe now's a good time to draw a legend
legend("topleft", # legend placement (instead of "x,y" coords)
legend=c("known distribution","estimated distribution"), # legend text
col=2:3, lty=1) # Draw lines: one red, one green
# In-class assignment: Make a similar plot for the gamma(2,3) distribution!
# This much is easy. But how do we estimate the parameters of a gamma distribution? See homework assignment!
# 5) Illustrating the central limit theorem
# The central limit states that the sum of n iid random variables with mean mu and variance sigma converges in distribution to a Normal(mean = n*mu, var = n*sigma^2). This is one of the most incredibly useful and powerful results since the Babylonians discovered that sweet and soggy barley spontaneously ferments into beer. We would like to visualize it as vividly as possible.
# Consider simulated data drawn from an exponential random variable
a <- rexp(1000,rate=1)
hist(a,breaks=40,freq=F)
curve(dnorm(x,mean=1,sd=1), # plot the CLT prediction
add=T,col=2,lwd=2)
# now we make a loop! (our first)
for(i in 1:30) # this says cycle 30 times
{
newa <- rexp(1000) # create new exponential data
a<-a+newa # sum it to the old data
hist(a,breaks=40,freq=F) # plot the puppy
curve(dnorm(x,mean=i*1,sd=sqrt(i)), # plot the CLT prediction
add=T,col=2,lwd=2)
}
# Whoa! that was cool, but way too fast (and crude).
# Here we have to do a quick something that is completely unintuitive
library(DAAG) # This is some random package in R that contains the function "pause()" which we want. What else it is for, I don't know. I don't remember how I found it. But the "pause()" function is great. This is an example of the somewhat arbitrary nature of R.
a <- rexp(1000,rate=1)
for(n in 1:30)
{
pause()
hist(a,breaks=40,freq=F,
main=paste("n=",n),col="grey") # note the "paste" function
curve(dnorm(x,mean=n,sd=sqrt(n)),
add=T,col=2,lwd=2)
newa <- rexp(1000)
a<-a+newa
}
# How about fixing the x limits
a <- rexp(1000,rate=1)
for(n in 1:30)
{
pause()
hist(a,breaks=40,freq=F,
main=paste("n=",n),col="grey",
xlim=c(0,40))
curve(dnorm(x,mean=n,sd=sqrt(n)),
add=T,col=2,lwd=2)
newa <- rexp(1000)
a<-a+newa
}
# How about a discrete distribution
p<-0.5
a <- rbinom(1000,1,p)
hist(a,breaks=40,freq=F,main="n=1") # plot the puppy
curve(dnorm(x,mean=p,sd=sqrt(p*(1-p))),
add=T,col=2,lwd=2)
for(n in 1:200)
{
pause()
hist(a,breaks=100,
main=paste("n=",n),col="grey")
curve(1000*dnorm(x,mean=n*p,sd=sqrt(n*p*(1-p))),
add=T,col=2,lwd=2)
newa <- rbinom(1000,1,p)
a<-a+newa
}
# How about a pathologically non-normal looking distribution?
p <-.99
a <- rbinom(1000,1,p)
hist(a,breaks=40,freq=F,main="n=1") # plot the puppy
for(n in 1:1000) # this says cycle 30 times
{
pause()
hist(a,breaks=100,
main=paste("n=",n),col="grey") # plot the puppy
curve(1000*dnorm(x,mean=n*p,sd=sqrt(n*p*(1-p))), # plot the CLT prediction
add=T,col=2,lwd=2)
newa <- rbinom(1000,1,p) # create new exponential data
a<-a+newa # sum it to the old data
}
##############################
# Some Numerical Experiments #
# (thanks to M.Keim) #
##############################
# Experiment 1: CLT
# create a blank vector to save simulation results
x.bars<-rep(NA,1000)
# create a loop to repeat the experiment
# "i" will index the iterations, starting at 1 and ending at 1000
for(i in 1:1000) # open loop
{
x<-rnorm(5,mean=0,sd=1) # simulate 5 standard normal variables
x.bars[i]<-mean(x) # store the sample mean of the 5 rv's
} # close the loop
# calculate the mean of our stored sample means
mean(x.bars) # should be near 0 (by E(X.bar) = mu)
# calculate the sample variance of the stored sample means
var(x.bars) # should be near 1/5 (by CLT)
# create a histogram of our sample means with 50 bins/breaks/intervals.
hist(x.bars,breaks=50,freq=F)
# draw a blue line (col="blue") that is double width (lwd=2) of the "kernel density" estimated for our sample means
lines(density(x.bars),col="blue",lwd=2)
# draw a double width red line of a N(0,1/5) density curve.
curve(dnorm(x,0,sqrt(1/5)),col="red",lwd=2,add=T)
# Exercise: Draw a green line of double width for a Normal curve having mean equal to the sample mean and standard deviation equal to the sample standard deviation.
# It's generally better to name and separate all your parameters at the top of your code
reps<-100
n<-3
mu<-0
sigma<-1
x.bars<-rep(0,reps)
for(i in 1:reps)
x.bars[i]<-mean(rnorm(n,mean=mu,sd=sigma))
mu.hat <- mean(x.bars) # should be near ___?
sigma.hat <- sd(x.bars) # should be near ___?
hist(x.bars,breaks=sqrt(reps),freq=F)
lines(density(x.bars),col="blue",lwd=2)
curve(dnorm(x,mu,sigma/sqrt(n)),col="red",lwd=2,add=T)
curve(dnorm(x,mu.hat,sigma.hat),col="green",lwd=2,add=T)
# re-run with n=30,100, num.sims=10^4
# Adapt the above code to Poisson and Uniform distributions.
#########################
# Experiment 2 with CLT #
#########################
# How does convergence to the CLT improve with sample size?
# Using a non-Gaussian example...
sizes<-c(2,5,10,30)
reps<-1000
r<-1 # exponential rate parameter
# we need to make an array (or matrix (or data.frame)) to put all of the
x.bars<-array(dim=c(1000,length(sizes)))
for(j in 1:(length(sizes)))
{
n<-sizes[j]
for(i in 1:reps)
x.bars[i,j]<-mean(rexp(n,rate=r))
}
# Plot each of these histograms
par(mfrow=c(2,2))
for(i in 1:length(sizes))
{
n<-sizes[i]
hist(x.bars[,i],breaks=50,freq=F,
main=paste("n =",n),
xlab="x.bar")
lines(density(x.bars[,i]),
col="blue",lwd=2)
curve(dnorm(x,mean=r,sd=1/sqrt(n)),
col="red",lwd=2,add=T)
}