Model Concepts in SPLUS

## How factors are represented in linear models: Suppose we have a factor with 4 levels

# dummy coding to create 3 dummy variables, represented by the columns:
contr.treatment(4)

# effect coding to create 3 new variables:
contr.sum(4)

# helmert contrasts (new variables are represented by the columns):
contr.helmert(4)

# orthogonal polynomials (for numeric levels with equal spacings or order factors with levels 
# that are assumed to have equal numeric spacings)
contr.poly(4)

# For unequal spacing, consult the Kirk(1995) copy I gave you.


# User-defined contrasts

# Suppose we have a factor with four levels like Age of the quine data set.
# We can create a set of contrasts (orthogonal or not) among the 4 levels

attach(quine)
my.Age<-Age
contrasts(my.Age)<-matrix(c(	1,   1,  -1, -1,
				1,  -1,  -1,  1,
				-.5, .5, -.5, .5 ), nr=4, nc=3, byrow=F)
contrasts(my.Age)
			
# and then use my.Age in a linear model.  These contrasts will be used.

model1<-lm(log(Days+1) ~ my.Age, data=quine)

model1$contrasts


# You can also create user-defined contrasts using the C function (see below)

# To override the contrasts on my.Age, use C:
model1<-lm(log(Days+1) ~ C(my.Age, sum) , data=quine)
model1$contrasts


# To change contrasts options for all factors: use the options settings

options()$contrasts
options(contrasts=c(factor="contr.treatment", ordered="contr.poly"))


#### Overdetermined models

# exactly singular matrix
attach(quine)
junk<-cbind(unique(Age), poly(unique(Age), 3))

# this is the linear component of poly(unique(my.Age), 3), which is a 
# linear combination of my.Age levels
(junk[,1]-2.5)/vecnorm((junk[,1]-2.5),p=2)

# So, my.Age levels can be recovered using 
2.236068*junk[,2] + 2.5

# And, the second column of junk is a linear combination of the 
# first column and an intercept column

# rank of this matrix along with an intercept column is 4, not 5
qr(cbind(1,junk))$rank

## using singular.ok
lm(log(Days+1) ~ cbind(Age, poly((Age), 3)), data=quine)
lm(log(Days+1) ~ cbind(Age, poly((Age), 3)), data=quine, singular.ok=T)


## nearly singular X

m1<-lm(Fuel~Weight, data=fuel.frame)
m2<-update(m1, . ~ . + I(Weight + rnorm(length(Weight),sd=.01)) )
kappa(m2)
kappa(m1)
m3<-update(m2, . ~ ., method="svd")

m2
m3

cor(model.matrix(m2)[,-1])
summary(m2)


##### Interactions and nesting:

### Interaction between one numeric variable and one factor

options(contrasts=c("contr.treatment", "contr.poly"))	# set all factors in formulas to use dummy coding

# These are all equivalent fits, but different way of showing information

# separate slopes
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")	
# separate slopes (same as above, different coefficients)
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")	
# separate slopes and intercepts
model3<-lm(Fuel ~ Type/Weight - 1, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")	

model2
model1
model3

# Which Type gets the 0 and which gets the 1?
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)

options(contrasts=c("contr.sum", "contr.poly"))		# effect coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")

model2
model1

# Which Type gets the -1 and which gets the 1?
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)


# Next two are equivalent since Type is the only factor
model3<-lm(Fuel~Type*Weight, data=fuel.frame)
model4<-lm(Fuel ~ C(Type, treatment)*Weight, data=fuel.frame) # use sum (or effect) contrasts for Type



## Add another level to the factor
options(contrasts=c("contr.treatment", "contr.poly"))	# dummy coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty" | Type=="Compact")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty"| Type=="Compact")

model2
model1

# What is the coding used?
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)


options(contrasts=c("contr.sum", "contr.poly"))	# effect coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty" | Type=="Compact")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty"| Type=="Compact")

model2
model1

# What is the coding used? (Definition of the effect coded variables)
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)



### Interaction between two factors (Two-way ANOVA)

options(contrasts=c("contr.treatment", "contr.poly"))	# Use dummy coding (contr.treatment) for factors

# not all countries occur for each type.  So, not all parameters can be estimated. 
# singular.ok=T removes all all-zero columns
model1<-lm(log(Price) ~ Country*Type, singular.ok=T, data=cu.summary)
model2<-lm(log(Price) ~ Type/Country, singular.ok=T, data=cu.summary)	# Country within Type 

# which are all-zero columns of the model matrix
dimnames(model.matrix(model1))[[2]][which(apply(model.matrix(model1),2,function(x) all(x==0)))]


# create a variable within which to nest (Country=="USA")
attach(cu.summary)
cu.summary$Small.Type<-(Type=="Small" | Type=="Sporty" | Type=="Compact")

model1<-lm(log(Price) ~ (Country=="USA")*Small.Type, data=cu.summary)
model2<-lm(log(Price) ~ Small.Type/(Country=="USA"), data=cu.summary)	# Country within Small.Type Status


summary(model1,cor=F)
summary(model2, cor=F)

# What is the coding used? (Definition of the effect coded variables)
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)


# ANOVA tables
anova(model1, ssType=3)
anova(model2, ssType=3)		# For balanaced designs, SScountry(type) = SScountryXType + SScountry.  
				# But, this is not a balanced design


options(contrasts=c("contr.sum", "contr.poly"))		# effect coding for factors
attach(cu.summary)
cu.summary$Small.Type<-(Type=="Small" | Type=="Sporty" | Type=="Compact")
model1<-lm(log(Price) ~ (Country=="USA")*Small.Type, data=cu.summary)
model2<-lm(log(Price) ~ Small.Type/(Country=="USA"), data=cu.summary)	# Country within Type

model1
model2

# What is the coding used? (Definition of the effect coded variables)
model1$contrasts
model2$contrasts

model.matrix(model1)
model.matrix(model2)


## Interaction between two numeric variables

lm(Fuel ~ Weight*Mileage, data=fuel.frame)
lm(Fuel ~ (Weight+Mileage)^2, data=fuel.frame)	# equivalent

lm(Fuel ~ I(Weight/Mileage), data=fuel.frame)
lm(Fuel ~ I(Weight*Mileage), data=fuel.frame)	
lm(Fuel ~ I(Weight^2)+I(Mileage^2), data=fuel.frame)
lm(Fuel ~ (Weight^2) + (Mileage^2), data=fuel.frame)	# equivalent


# polynomial regression (with orthogonal polynomials)

attach(fuel.frame)
model1<-lm(log(Tank)~ C(Tires,poly,3) + Disp2 + HP.revs, data=cu.specs)	# fit up to a cubic component in Tires
model.matrix(model1)

summary(model1, cor=F)


## Updating linear models

# Remove cubic component
model2<-update(model1, . ~ . -C(Tires,poly,3) + C(Tires,poly,2))
summary(model2, cor=F)


## Fitting all predictors in a data frame

lm(Fuel~., data=fuel.frame)


## User-defined contrasts

attach(quine)

# Here we want to contrast first two Age groups with last two
# We also get two additional independent contrasts, which we can ignore
model1<-lm(log(Days+1)~Eth + C(Age,c(1,1,-1,-1)), data=quine)	

# to see what the additional contrasts are:													
model1$contrasts		


## Model Frames and model matrices

attach(quine)
mmatrix<-model.matrix(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn)
mframe<-model.frame(model.matrix(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn))

attr(mmatrix, "assign")

# getting the model frame from an lm fit
lm(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn, data=quine, method="model.frame")


## Adding and dropping one term at a time and testing the resulting model

See functions add1 and drop1