####### Code for HW 2 - Version: 2025

#### Question 1 ####

SH_da <- read.csv("http://www.bauer.uh.edu/rsusmel/4397/Shiller_data.csv", head=TRUE, sep=",")
x_date <- SH_da$Date
x_P <- SH_da$P 
x_D <- SH_da$D
x_E <- SH_da$E
T <- length(x_S)
cpi_us <- SH_da$CPI
i_us <- SH_da$Long_i

T <- length(x_P)
lr_p <- log(x_P[-1]/x_P[-T])
lr_d <- log(x_D[-1]/x_D[-T])
lr_e <- log(x_E[-1]/x_E[-T])
inf <- log(cpi_us[-1]/cpi_us[-T])
int <- (i_us[-1])/100

T <- length(lr_p)


## a. Report Regression
fit_mod <- lm (lr_p ~ lr_e + inf + int)
summary(fit_mod)

## b. Interpret R^2 (the model explains % of the variability of stock returns. Or more precisely, % of the variability of stock returns that is explained by the variability of earnings, inflation & interest rates)

## c. Interpret coefficient beta_1 (If earnings increase by 1%, stock returns increase by beta_1%

## d. Report F-stat in fit_mod output. Based on p-value reject reject H0.

## e. F test of Restrictions (beta_1 = beta_3 = 0)
e_u <- fit_mod$residuals
RSS_u <- sum(e_u^2)				# Unrestricted RSS

fit_r	<- lm (lr_p ~ inf )			# Restricted model
e_r <- fit_r$residuals
RSS_r <- sum(e_r^2)				# Restricted RSS

F_test_1 <- ((RSS_r - RSS_u)/2)/(RSS_u/(T-4))	# F-test
F_test_1
qf(.95, df1 = 2, df2= T-4)			# F-value from table (if larger than F_test_1 cannot reject H0)
p_val <- 1 - pf(F_test_1, df1 = 2, df2= T-4) 	# p-value of F-test
p_val

## You could have also used package car. 
library(car)
linearHypothesis(fit_mod, c("lr_e = 0","int = 0"), test="F") 	# F: exact test 

## f. Testing J=2 Hypothesis with R package car
library(car)
linearHypothesis(fit_mod, c("inf = 0.5","int = -0.1"), test="F") 	# F: exact test 


#### Question 2 ####

fit_mod <- lm (lr_p ~ lr_e + inf + int)
sh_ols <- summary(fit_mod)$coef[,1]		#extracting OLS estimated parameters from lm (1st row in table)

dat_Sh <- data.frame(lr_p, lr_e, inf, int)	# create R dataframe to use in lmboot	
sim_size <- 1000				# R will mention that it's recommended to habe B=sim_size > T. You can set B = 2000	

library(lmboot)
sh_b <- paired.boot(lr_p ~ lr_e + inf + int, data=dat_Sh, B=sim_size)	#paired bootstrap in lm

## a. Mean values for b (bootstrap estimates of b)
mean(sh_b$bootEstParam[,1])		# print mean of bootstrap samples for constant
mean(sh_b$bootEstParam[,2]) 		# print mean of bootstrap samples for lr_e
mean(sh_b$bootEstParam[,3])		# print mean of bootstrap samples for inf
mean(sh_b$bootEstParam[,4]) 		# print mean of bootstrap samples for int

# Note: You can get from lmboot the OLS estimated parameters
sh_b$origEstParam			# OLS ("Original") estimated parameters

## a. bootstrap bias (OLS estimated parameter - Bootstrap estimated parameter)
sh_b$origEstParam[1] - mean(sh_b$bootEstParam[,1])
sh_b$origEstParam[2] - mean(sh_b$bootEstParam[,2])
sh_b$origEstParam[3] - mean(sh_b$bootEstParam[,3])
sh_b$origEstParam[4] - mean(sh_b$bootEstParam[,4])

## b. Use quantile function to get 95% CI for beta_2 (inf) parameter (percentile method)
quantile(sh_b$bootEstParam[,3], probs=c(.025, .975))


#### Question 3 ####
## 3.a - Plot residuals 
x_resid <- residuals(fit_mod)
x_stand_resid <- x_resid/sd(x_resid) 	# standardized residuals
plot(x_resid, typ ="l", col="blue", main ="Shiller Residuals from Model", xlab="Date", ylab="Mod residuals")

## 3.a - Cook's D
cooksd <- cooks.distance(fit_mod)
# Plot Cook's distance
plot(cooksd, pch="*", cex=2, main="Influential Obs by Cooks distance") 
# add cutoff line
abline(h = 4*mean(cooksd, na.rm=T), col="red")  # add cutoff line
# add labels
text(x=1:length(cooksd)+1, y=cooksd, labels=ifelse(cooksd>4*mean(cooksd, na.rm=T), names(cooksd),""), col="red")  # add labels

## 3.b - Proportion of Violations of Rules of Thumb
T <- length(x_resid)
k <- length(fit_mod$coefficients)
library(olsrr)				# need to install package olsrr
sum(x_stand_resid > 2)			# Rule of thumb count (5% count is OK)
x_lev <- ols_leverage(fit_mod)		# leverage residuals
sum(x_lev > (2*k+2)/T) 			# Rule of thumb count (5% count is OK)
sum(cooksd > 4/T)			# Rule of thumb count (5% count is OK)

sum(x_stand_resid > 2)/T		# proportion of "outliers" - stand resid
sum(x_lev > (2*k+2)/T)/T		# proportion of "outliers" - leverage	 
sum(cooksd > 4/T)/T			# proportion of "outliers" - Cook's D

## Interpretation: Overall, there's no strong evidence of outliers (only leverage has a proportion slightly bigger than 5%), and there's no evidence of Multicollinearity (VIF and Condition index are all lower than the Rules of Thumb)


#### Question 4 ####
Check Notes