############# Goodness of Fit Measures AND Maximum Likelihood Estimation & Data Problems (Code #5c) #############

#### Regression & R^2

### 3-factor FF Regression for IBM: Compute R^2 & Adj-R^2
SFX_da <- read.csv("http://www.bauer.uh.edu/rsusmel/4397/Stocks_FX_1973.csv", head=TRUE, sep=",")
x_ibm <- SFX_da$IBM          # extract IBM price data
x_Mkt_RF <- SFX_da$Mkt_RF    # extract Market excess returns (in %)
x_RF <- SFX_da$RF            # extract Risk-free rate (in %)
x_SMB <- SFX_da$SMB          # extract SMB factor (in %)
x_HML <- SFX_da$HML          # extract HML factor (in %)

# Define log returns & adjust size of variables accordingly
T <- length(x_ibm)                   # sample size
lr_ibm <- log(x_ibm[-1]/x_ibm[-T])   # create IBM log returns (in decimal returns)
Mkt_RF <- x_Mkt_RF[-1]/100           # adjust sample size to (T-1) by removing 1st obs & use decimal returns
RF <- x_RF[-1]/100
SMB <- x_SMB[-1]/100
HML <- x_HML[-1]/100
y <- lr_ibm - RF                     # IBM excess returns
T <- length(y)
k <- 4                               # number of regressors (to be used in AIC)

fit_ibm_ff3 <- lm(y ~ Mkt_RF + SMB + HML)    # 3-factor FF regression
summary(fit_ibm_ff3)
e_ibm_ff3 <- fit_ibm_ff3$residuals           # extract OLS residuals from lm object
RSS_ibm_ff3 <- sum(e_ibm_ff3^2)              # RSS
R2_ibm_ff3 <- 1 - RSS_ibm_ff3/as.numeric(t(y)%*%y)   # (uncentered) R^2
AIC <- log(RSS_ibm_ff3/T) + 2*k/T            # Akaike's IC
summary(fit_ibm_ff3)$r.squared               # R^2 reported by lm
summary(fit_ibm_ff3)$adj.r.squared           # Adjusted R^2 reported by lm

### R^2 for 3-factor FF Regression for IBM + Noise factor
Noise_factor <- rnorm(T)                                     # create irrelevant variable (pure noise)
fit_ibm_ff3_N <- lm(y ~ Mkt_RF + SMB + HML + Noise_factor)   # 3-factor FF regression + Noise factor
summary(fit_ibm_ff3_N)
summary(fit_ibm_ff3_N)$r.squared       # check that R^2 increased relative to R2_ibm_ff3
summary(fit_ibm_ff3_N)$adj.r.squared   # Adjusted R^2 penalizes the irrelevant regressor

#### Maximum Likelihood Example: Toss a coin 100 times. We get 60 heads
p <- .50   # initial value for p (=.50 if I believe the coin is fair)
h <- 60    # 60 heads
n <- 100   # 100 tosses
factorial(n)/(factorial(n-h)*factorial(h))   # number of ways to get 60 heads out of 100
choose(n,h)                                  # R function for the same combination
prob_h <- choose(n,h)*p^h*(1-p)^(n-h)        # binomial probability of observing h heads
prob_h
dbinom(h, 100, p)                            # R way of calculating binomial prob of observing h heads

# Calculate the p that maximizes the probability of observing 60 heads
# Step 1 - Create Likelihood function
likelihood_b <- function(p){            # likelihood as a function of p
  dbinom(h, 100, p)
}
likelihood_b(p)                         # print likelihood at initial value
negative_likelihood_b <- function(p){   # nlm minimizes, so change the sign of the function
  dbinom(h, 100, p) * (-1)
}
negative_likelihood_b(p)
# Step 2 - Maximize (or minimize the negative Likelihood function)
nlm(negative_likelihood_b, p, stepmax=0.5)   # nlm minimizes the function
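# (Added check, not in the original script) For the binomial likelihood the MLE
# has a closed form, p_hat = h/n; a quick sanity check on the nlm output above:
h/n                 # = 0.6; should match the nlm estimate
likelihood_b(h/n)   # likelihood at the MLE (the maximized probability)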
#### Normal sample: 5, 6, 7, 8, 9, 10 with unknown mean (& known SD=1)
mu <- 0                   # initial value for mu (=mean)
x_6 <- seq(5, 10, by=1)   # observed data x_6 = (5, 6, 7, 8, 9, 10)
x_6
dnorm(5, mu, sd=1)        # density of x=5 when data come from N(0,1)
dnorm(x_6)                # densities of the whole vector x_6 under N(0,1)
prod(dnorm(x_6))          # likelihood function evaluated at observed data
l_f <- prod(dnorm(x_6))   # assign l_f to the likelihood evaluated at observed data
l_f
log(l_f)                  # log likelihood function evaluated at observed data
sum(log(dnorm(x_6)))      # compact notation for the same log likelihood

# Calculate the mu that maximizes the probability of observing {5, 6, 7, 8, 9, 10}
# Step 1 - Create Likelihood function
likelihood_n <- function(mu){            # log likelihood as a function of mu
  sum(log(dnorm(x_6, mu, sd=1)))
}
likelihood_n(mu)                         # print log likelihood at initial value
negative_likelihood_n <- function(mu){   # nlm minimizes, so change the sign of the function
  sum(log(dnorm(x_6, mu, sd=1))) * (-1)
}
negative_likelihood_n(mu)
# Step 2 - Maximize (or minimize the negative Likelihood function)
results_n <- nlm(negative_likelihood_n, mu, stepmax=2)   # nlm minimizes the function
results_n                      # show nlm results
mu_max <- results_n$estimate   # extract estimate
mu_max                         # should equal the sample mean (= 7.5)
likelihood_n(mu_max)           # check max value at mu_max

# Standard Error of mu
results_n <- nlm(negative_likelihood_n, mu, stepmax=2, hessian=TRUE)
mu_hess <- results_n$hessian   # extract estimated Hessian (matrix of 2nd derivatives)
mu_hess                        # show Hessian
cov <- solve(mu_hess)          # invert Hessian to get Var(mu_MLE)
cov                            # show variance for mu_MLE
stderr <- sqrt(diag(cov))      # compute S.E. for mu_MLE
stderr

#### Normal sample: 5, 6, 7, 8, 9, 10 with unknown mean & SD
mu <- 0                   # initial value for mu (=mean)
sig <- 1                  # initial value for sig (=SD)
x_6 <- seq(5, 10, by=1)
x_6
dnorm(x_6)
sum(log(dnorm(x_6)))      # log likelihood evaluated at observed data
x <- c(0, 1)              # vector of initial values (mu, sig)
# Step 1 - Create Likelihood function
likelihood_lf <- function(x){            # log likelihood with mu & sig as arguments
  mu <- x[1]
  sig <- x[2]
  sum(log(dnorm(x_6, mu, sd=sig)))
}
likelihood_lf(x)                         # print log likelihood at initial values
negative_likelihood_lf <- function(x){   # nlm minimizes, so change the sign of the function
  mu <- x[1]
  sig <- x[2]
  sum(log(dnorm(x_6, mu, sd=sig))) * (-1)
}
negative_likelihood_lf(x)
# Step 2 - Maximize (or minimize the negative Likelihood function)
results_lf <- nlm(negative_likelihood_lf, x, stepmax=4, hessian=TRUE)   # nlm minimizes the function
results_lf                       # show nlm results
par_max <- results_lf$estimate   # extract estimates
par_max                          # first element should equal the sample mean
likelihood_lf(par_max)           # check max value of the log likelihood

# If sigma^2 is also unknown
par_max[2]^2                           # second element of par_max is sigma => square it for sigma^2
sig_2 <- sum((x_6 - par_max[1])^2)/6   # MLE of sigma^2: dividing by T=6
var(x_6)                               # s^2 => dividing by (T-1)=5
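# (Added check, not in the original script) Both normal MLEs have closed forms,
# so we can verify the nlm output directly:
mu_mle <- mean(x_6)                                  # closed-form MLE of mu
sig_mle <- sqrt(sum((x_6 - mu_mle)^2)/length(x_6))   # closed-form MLE of sigma (divides by T, not T-1)
c(mu_mle, sig_mle)                                   # compare with par_max from nlm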
# Compute S.E. by inverting the Hessian
coeff_hess <- results_lf$hessian   # extract Hessian
coeff_hess                         # show Hessian
cov_lf <- solve(coeff_hess)        # invert Hessian to get Cov(MLE estimates)
cov_lf                             # show covariance matrix
se_lf <- sqrt(diag(cov_lf))        # compute standard errors of MLE estimates
se_lf
par_max[1]/se_lf[1]                # t-ratio for mu
par_max[2]/se_lf[2]                # t-ratio for sigma

#### A more general construction of a Likelihood function under Normality
loglike_norm <- function(x, dat){  # always use x to hold parameter values
  mu <- x[1]
  sig <- x[2]
  T <- length(dat)
  loglike <- -T*log(sig*sqrt(2*pi)) - sum((1/(2*sig^2)) * (dat - mu)^2)
  return(loglike * (-1))           # return negative log likelihood (nlm minimizes)
}
x0 <- c(0, 1)                      # initial values (mu, sig)
lf <- loglike_norm(x0, x_6)
lf
results_lf <- nlm(loglike_norm, x0, dat=x_6, stepmax=4, hessian=TRUE)   # nlm minimizes the function
results_lf                         # show nlm results
coeff_max <- results_lf$estimate   # extract estimates
coeff_hess <- results_lf$hessian   # extract Hessian
coeff_hess                         # show Hessian
cov_lf <- solve(coeff_hess)        # invert Hessian to get Cov(MLE estimates)
cov_lf                             # show covariance matrix
se_lf <- sqrt(diag(cov_lf))        # compute standard errors of MLE estimates
se_lf

#### Linear Model - Fama-French 3-factor Model
SFX_da <- read.csv("http://www.bauer.uh.edu/rsusmel/4397/Stocks_FX_1973.csv", head=TRUE, sep=",")
names(SFX_da)
x_sp <- SFX_da$SP500
x_ibm <- SFX_da$IBM
x_Mkt_RF <- SFX_da$Mkt_RF
x_SMB <- SFX_da$SMB
x_HML <- SFX_da$HML
x_RF <- SFX_da$RF
T <- length(x_ibm)
lr_ibm <- log(x_ibm[-1]/x_ibm[-T])
lr_sp <- log(x_sp[-1]/x_sp[-T])
x0 <- matrix(1, T-1, 1)            # column of ones (constant)
Mkt_RF <- x_Mkt_RF[-1]/100
SMB <- x_SMB[-1]/100
HML <- x_HML[-1]/100
RF <- x_RF[-1]/100
ibm_x <- lr_ibm - RF               # IBM excess returns
X <- cbind(x0, Mkt_RF, SMB, HML)   # regressor matrix

# OLS results
fit_ibm_ff3 <- lm(ibm_x ~ Mkt_RF + SMB + HML)
summary(fit_ibm_ff3)

# Step 1 - Negative Likelihood function
likelihood_ff3 <- function(theta, y, X){
  N <- nrow(X)
  k <- ncol(X)
  beta <- theta[1:k]        # first k elements: regression coefficients
  sigma2 <- theta[k+1]^2    # last element: sigma (squared here, so sigma2 > 0)
  e <- y - X%*%beta         # residuals
  logl <- -.5*N*log(2*pi) - .5*N*log(sigma2) - ((t(e)%*%e)/(2*sigma2))
  return(-logl)             # negative log likelihood (nlm minimizes)
}
theta <- c(0, 1, 1, 1, .1)                # initial values (4 betas & sigma)
likelihood_ff3(theta, ibm_x, X)
# Step 2 - Maximize (or minimize the negative Likelihood function)
results_ff3 <- nlm(likelihood_ff3, theta, hessian=TRUE, y=ibm_x, X=X)   # nlm minimizes the function
results_ff3                               # show nlm results
par_max_ff3 <- results_ff3$estimate       # extract estimates
par_max_ff3                               # should match the OLS coefficients
likelihood_ff3(par_max_ff3, ibm_x, X)     # check max value of the log likelihood
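# (Added check, not in the original script) Under normality, the MLE betas
# coincide with OLS; only the variance differs (MLE divides by N, OLS by N-k):
cbind(OLS = coef(fit_ibm_ff3), MLE = par_max_ff3[1:4])   # betas side by side
sum(residuals(fit_ibm_ff3)^2)/length(ibm_x)              # 1/N variance => compare with par_max_ff3[5]^2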
# Step 3 - Compute S.E. by inverting the Hessian
coeff_hess_ff3 <- results_ff3$hessian   # extract Hessian
coeff_hess_ff3                          # show Hessian matrix
cov_ff3 <- solve(coeff_hess_ff3)        # invert Hessian to get Cov(MLE estimates)
cov_ff3                                 # show covariance matrix
se_ff3 <- sqrt(diag(cov_ff3))           # compute standard errors of MLE estimates
se_ff3
(par_max_ff3[2] - 1)/se_ff3[2]          # t-test for H0: beta (Market) = 1
par_max_ff3[5]/se_ff3[5]                # t-ratio for sigma

#### Inflate data (work with 1% instead of .01) to avoid strange values for the (positive) negative log L
ibm_x100 <- (lr_ibm - RF)*100
X100 <- cbind(x0, Mkt_RF, SMB, HML)*100
fit_ff3100 <- lm(ibm_x100 ~ X100 - 1)    # "-1": X100 already includes a constant
summary(fit_ff3100)
theta <- c(0, 1, 1, 1, .1)               # initial values
likelihood_ff3(theta, ibm_x100, X100)
results_ff3 <- nlm(likelihood_ff3, theta, hessian=TRUE, y=ibm_x100, X=X100)   # nlm minimizes the function
results_ff3                              # show nlm results
par_max_ff3 <- results_ff3$estimate      # extract estimates
par_max_ff3                              # should match the OLS coefficients above
likelihood_ff3(par_max_ff3, ibm_x100, X100)   # check max value of the log likelihood

##### Data Problems: Outliers & Multicollinearity
y <- ibm_x
x <- cbind(x0, Mkt_RF, SMB, HML)
k <- ncol(x)               # number of regressors (including the constant)
T <- length(y)             # sample size used in the rules of thumb below
dat_xy <- data.frame(y, x)
summary(fit_ibm_ff3)       # baseline regression, for comparison
y[10] <- -0.85             # introduce an artificial outlier
y[90] <- 0.95              # introduce a second artificial outlier
fit_ibm_ff3_out <- lm(y ~ Mkt_RF + SMB + HML)
summary(fit_ibm_ff3_out)   # note how the outliers change the estimates

# Plot residuals
x_resid <- residuals(fit_ibm_ff3)
plot(x_resid, type="l", col="blue", main="IBM Residuals from 3 FF Factor Model",
     xlab="Date", ylab="IBM residuals")

# Plot standardized residuals
x_stand_resid <- x_resid/sd(x_resid)   # standardized residuals
plot(x_stand_resid, type="l", col="blue", main="IBM Standardized Residuals from 3 FF Factor Model",
     xlab="Date", ylab="IBM standardized residuals")

# Plot Cook's distance
cooksd <- cooks.distance(fit_ibm_ff3)
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cook's distance")
abline(h = 4*mean(cooksd, na.rm=TRUE), col="red")   # add cutoff line (4 x mean rule of thumb)
text(x=1:length(cooksd)+1, y=cooksd,
     labels=ifelse(cooksd > 4*mean(cooksd, na.rm=TRUE), names(cooksd), ""),
     col="red")                                     # label influential observations

# Influential row numbers
influential <- as.numeric(names(cooksd)[(cooksd > 4*mean(cooksd, na.rm=TRUE))])
head(dat_xy[influential, ], n=10L)     # print first 10 influential observations

install.packages("olsrr")              # need to install package olsrr (once)
library(olsrr)
sum(abs(x_stand_resid) > 2)            # rule-of-thumb count (a count near 5% of T is OK)
x_lev <- ols_leverage(fit_ibm_ff3)     # leverage values
sum(x_lev > (2*k+2)/T)                 # rule-of-thumb count (a count near 5% of T is OK)
sum(cooksd > 4/T)                      # rule-of-thumb count (a count near 5% of T is OK)
sum(abs(x_stand_resid) > 2)/T          # proportion of "outliers"
sum(x_lev > (2*k+2)/T)/T               # proportion of "outliers"
sum(cooksd > 4/T)/T                    # proportion of "outliers"
ols_plot_resid_stand(fit_ibm_ff3)      # plot standardized residuals
ols_plot_cooksd_bar(fit_ibm_ff3)       # plot Cook's D measure
ols_plot_dfbetas(fit_ibm_ff3)          # plot differences in betas
ols_vif_tol(fit_ibm_ff3)               # VIF & tolerance (multicollinearity diagnostics)
ols_eigen_cindex(fit_ibm_ff3)          # eigenvalues & condition indices
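# (Added sketch, not in the original script) VIF can also be computed by hand:
# regress each factor on the remaining ones and use VIF_j = 1/(1 - R^2_j).
# Illustrated for Mkt_RF, using the factor series defined above:
r2_mkt <- summary(lm(Mkt_RF ~ SMB + HML))$r.squared
1/(1 - r2_mkt)   # should match the Mkt_RF VIF from ols_vif_tol()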