############# Regression with lm function and Linear Algebra (Code #3B - Used in Lecture 6) #############

#### Regression using R with lm fuction

SFX_da <- read.csv("https://www.bauer.uh.edu/rsusmel/4397/Stocks_FX_1973.csv", head=TRUE, sep=",")
names(SFX_da)
summary(SFX_da)

# Extract variables from imported data
x_ibm <- SFX_da$IBM					# extract IBM price data
x_Mkt_RF <- SFX_da$Mkt_RF				# extract Market excess returns (in %)
x_RF <- SFX_da$RF					# extract Risk-free rate (in %)

# Define log returns & adjust size of variables accordingly
T <- length(x_ibm)					# sample size
lr_ibm <- log(x_ibm[-1]/x_ibm[-T])			# create IBM log returns (in decimal returns)
Mkt_RF <- x_Mkt_RF[-1]/100				# Adjust sample size to ( T-1) by removing 1st obs 
RF <- x_RF[-1]/100					# Adjust sample size and use decimal returns.

# Scatter plot of the two varialbes
plot(lr_ibm, Mkt_RF, main="Scatterplot: IBM & Market",
   xlab="IBM returns ", ylab="Market returns ", pch=19)

# Regression using lm
ibm_x <- lr_ibm - RF					# IBM excess returns
T <- length(ibm_x)

fit_ibm_capm <- lm(ibm_x ~ Mkt_RF)			# lm (=linear model) package in R
summary(fit_ibm_capm)					# print lm results

# Scatter plot of the two varialbes with regression line
plot(lr_ibm, Mkt_RF, main="Scatterplot: IBM & Market",
   xlab="IBM returns ", ylab="Market returns ", pch=19)
abline(fit_ibm_capm, col="blue")

# Expected (CAPM) vs Realized IBM return
exp_ret <- fit_ibm_capm$coefficients[2] * mean(Mkt_RF)	# Monthly expected return according to CAPM
exp_ret
mean(ibm_x)						# Realized return
mean(ibm_x) - exp_ret 					# Over-r-performance if positive / Under-performance if negative

# Regression using lm with R data frame, with different sample sizes
data_full <- data.frame(ibm_x, Mkt_RF)

fit_ibm_capm_full <- lm(ibm_x ~ Mkt_RF, data = data_full)	# lm reg with full data
summary(fit_ibm_capm_full)			
mean(Mkt_RF) * 12						# Annualized Market excess return

T0 <- 1
T1 <- 200			# August 1989
data_short <- data_full[T0:T1, ]

fit_ibm_capm_short <- lm(ibm_x ~ Mkt_RF, data = data_short)	# lm reg with shorter data
summary(fit_ibm_capm_short)
mean(Mkt_RF[T0:T1]) * 12					# Annualized Market excess return

T0 <- 201
T1 <- 400			# May 2006
data_short <- data_full[T0:T1, ]

fit_ibm_capm_short <- lm(ibm_x ~ Mkt_RF, data = data_short)	# lm reg with shorter data
summary(fit_ibm_capm_short)
mean(Mkt_RF[T0:T1]) * 12					# Annualized Market excess return

T0 <- 401
T1 <- T				# End of Sample (Dec 2024)
data_short <- data_full[T0:T1, ]

fit_ibm_capm_short <- lm(ibm_x ~ Mkt_RF, data = data_short)	# lm reg with shorter data
summary(fit_ibm_capm_short)
mean(Mkt_RF[T0:T1]) * 12					# Annualized Market excess return

## Note: Beta coeff are "time-varying" - The beta for the whole period is an average of betas over different periods
 (0.779049 * 223 + 1.076962 * 200 + 0.790583 * 200)/T
fit_ibm_capm_full$coefficients[2]


#### Matrices and Regression with Linear Algebra

v1 <- c(1, 3, 8)				# a (3x1) vector (vectors are usually treated as a column list)
v1

A <- matrix(c(3, 2, 1, 9, 8, 7), ncol = 3)	# a (2x3) matrix
A

B <- matrix(c(1, 2, 0, 1, 1, 1), nrow = 3)
B

v1 <- c(1, 3, 8)				# a (3x1) vector
v2 <- c(2, 7, 9)

# Use rbind
A <- rbind(v1, v2)
A						# a (2x3) matrix

# Use cbind
 v3 <- c(1, 2, 0)
 v4 <- c(1, 1, 1)
B <- cbind(v3,v4)
B						# a (3x2) matrix

# Matrix multiplication: %*%
C <- A%*%B					#A is 2x3; B is 3x2	=> C is 2x2
C

# Scalar multiplication: * 	
2 * C						# elementwise multiplication of C by scalar 2

# Dot product of 2 vectors: v1 "." v2 - Sum of elementwise multiplied elements of the two vectos
t(v1) %*% v2					# v1 <- c(1, 3, 8) & v2 <- c(2, 7, 9)	=> sum = 1*2 + 3*7 + 8*9

# Dot product with a vector itself: v1 "." - v1 produces a sum of the square elements of vector
t(v1) %*% v1					# sum = 1^2 + 3^2 + 8^2

# Dot product with ί (a vector of ones): sum of elements of vector
i <- c(1,1,1); 
t(i) %*% v1					# v1 <- c(1, 3, 8)	=> sum = 1 + 3 + 8

# Product of 2 vectors: v1 & t(v2): A (3x3) matrix. 
v1%*%t(v2)	 				# v1 <- c(1, 3, 8)  --a (3x1) vector x (1x3) vector: v2 <- c(2, 7, 9)

# Transpose
B
t(B) 						#B is 3x2  => t(B) is 2x3

# X'X (a symmetric matrix)
t(B)%*%B

# Determinant: det(D)  		(D matrix needs to be square)
det(t(B)%*%B)

# Inverse of (X'X): solve	
solve(t(B)%*%B) 	

# Diagonal elements of a matrix A: diag()
diag(solve(t(B)%*%B))

# Square root of (positive) elements of a matrix A: sqrt()
sqrt(diag(solve(t(B)%*%B)))


# Regression with Linear Algebra
y <- lr_ibm - RF
T <- length(ibm_x)
x0 <- matrix(1,T,1)
x <- cbind(x0, Mkt_RF)
b <- solve(t(x)%*% x)%*% t(x)%*%y 		# b = inv(X′X) X′y  (OLS regression)
b

y_fit <- x%*%b					# fitted values
e <- y - x%*%b					# OLS "errors" = residuals
e2 <- e^2					# squared residual vector
sum_e2 <- sum(e2)				# sum of squared residuals
sum_e2_1 <- t(e)%*%e				# sum of squared residuals using dot product

plot(y_fit, type="l", main="CAPM fitted values for IBM")
lines(y, col="red")

plot(e, type="l", main="CAPM residuals for IBM")


### Multiple Regression with Linear Algebra (with the 3 Fama-French factors)
x_SMB <- SFX_da$SMB				# extract Size factor (SMB returns, in %)
x_HML <- SFX_da$HML				# extract Value factor (HML returns, in %)
SMB <- x_SMB[-1]/100				# Adjust size (remove 1st observation) and convert to decimal returns.
HML <- x_HML[-1]/100				# Adjust size (remove 1st observation) and convert to decimal returns.
y <- ibm_x
T <- length(y)
x0 <- matrix(1,T,1)
x <- cbind(x0, Mkt_RF, SMB, HML)		# Matrix X (with 4 columns = constant + 3 factors)
k <- ncol(x)					# number of regressors (including constant)
k
b <- solve(t(x)%*% x)%*% t(x)%*%y	 	# b = (X'X)^(-1) X' y  	(OLS regression)
b

# We get the same results using lm()
fit_capm_ff3 <- lm (y ~ Mkt_RF + SMB + HML)
summary(fit_capm_ff3)