```
Model Concepts in SPLUS

## How factors are represented in linear models: Suppose we have a factor with 4 levels

# dummy coding to create 3 dummy variables, represented by the columns:
contr.treatment(4)

# effect coding to create 3 new variables:
contr.sum(4)

# helmert contrasts (new variables are represented by the columns):
contr.helmert(4)

# orthogonal polynomials (for numeric levels with equal spacings or order factors with levels
# that are assumed to have equal numeric spacings)
contr.poly(4)

# For unequal spacing, consult the Kirk(1995) copy I gave you.

# User-defined contrasts

# Suppose we have a factor with four levels like Age of the quine data set.
# We can create a set of contrasts (orthogonal or not) among the 4 levels

attach(quine)
my.Age<-Age
contrasts(my.Age)<-matrix(c(	1,   1,  -1, -1,
1,  -1,  -1,  1,
-.5, .5, -.5, .5 ), nr=4, nc=3, byrow=F)
contrasts(my.Age)

# and then use my.Age in a linear model.  These contrasts will be used.

model1<-lm(log(Days+1) ~ my.Age, data=quine)

model1\$contrasts

# You can also create user-defined contrasts using the C function (see below)

# To override the contrasts on my.Age, use C:
model1<-lm(log(Days+1) ~ C(my.Age, sum) , data=quine)
model1\$contrasts

# To change contrasts options for all factors: use the options settings

options()\$contrasts
options(contrasts=c(factor="contr.treatment", ordered="contr.poly"))

#### Overdetermined models

# exactly singular matrix
attach(quine)
junk<-cbind(unique(Age), poly(unique(Age), 3))

# this is the linear component of poly(unique(my.Age), 3), which is a
# linear combination of my.Age levels
(junk[,1]-2.5)/vecnorm((junk[,1]-2.5),p=2)

# So, my.Age levels can be recovered using
2.236068*junk[,2] + 2.5

# And, the second column of junk is a linear combination of the
# first column and an intercept column

# rank of this matrix along with an intercept column is 4, not 5
qr(cbind(1,junk))\$rank

## using singular.ok
lm(log(Days+1) ~ cbind(Age, poly((Age), 3)), data=quine)
lm(log(Days+1) ~ cbind(Age, poly((Age), 3)), data=quine, singular.ok=T)

## nearly singular X

m1<-lm(Fuel~Weight, data=fuel.frame)
m2<-update(m1, . ~ . + I(Weight + rnorm(length(Weight),sd=.01)) )
kappa(m2)
kappa(m1)
m3<-update(m2, . ~ ., method="svd")

m2
m3

cor(model.matrix(m2)[,-1])
summary(m2)

##### Interactions and nesting:

### Interaction between one numeric variable and one factor

options(contrasts=c("contr.treatment", "contr.poly"))	# set all factors in formulas to use dummy coding

# These are all equivalent fits, but different way of showing information

# separate slopes
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")
# separate slopes (same as above, different coefficients)
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")
# separate slopes and intercepts
model3<-lm(Fuel ~ Type/Weight - 1, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")

model2
model1
model3

# Which Type gets the 0 and which gets the 1?
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

options(contrasts=c("contr.sum", "contr.poly"))		# effect coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty")

model2
model1

# Which Type gets the -1 and which gets the 1?
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

# Next two are equivalent since Type is the only factor
model3<-lm(Fuel~Type*Weight, data=fuel.frame)
model4<-lm(Fuel ~ C(Type, treatment)*Weight, data=fuel.frame) # use sum (or effect) contrasts for Type

## Add another level to the factor
options(contrasts=c("contr.treatment", "contr.poly"))	# dummy coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty" | Type=="Compact")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty"| Type=="Compact")

model2
model1

# What is the coding used?
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

options(contrasts=c("contr.sum", "contr.poly"))	# effect coding
model1<-lm(Fuel ~ Type*Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty" | Type=="Compact")
model2<-lm(Fuel ~ Type/Weight, data=fuel.frame, subset=Type=="Small"|Type=="Sporty"| Type=="Compact")

model2
model1

# What is the coding used? (Definition of the effect coded variables)
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

### Interaction between two factors (Two-way ANOVA)

options(contrasts=c("contr.treatment", "contr.poly"))	# Use dummy coding (contr.treatment) for factors

# not all countries occur for each type.  So, not all parameters can be estimated.
# singular.ok=T removes all all-zero columns
model1<-lm(log(Price) ~ Country*Type, singular.ok=T, data=cu.summary)
model2<-lm(log(Price) ~ Type/Country, singular.ok=T, data=cu.summary)	# Country within Type

# which are all-zero columns of the model matrix
dimnames(model.matrix(model1))[[2]][which(apply(model.matrix(model1),2,function(x) all(x==0)))]

# create a variable within which to nest (Country=="USA")
attach(cu.summary)
cu.summary\$Small.Type<-(Type=="Small" | Type=="Sporty" | Type=="Compact")

model1<-lm(log(Price) ~ (Country=="USA")*Small.Type, data=cu.summary)
model2<-lm(log(Price) ~ Small.Type/(Country=="USA"), data=cu.summary)	# Country within Small.Type Status

summary(model1,cor=F)
summary(model2, cor=F)

# What is the coding used? (Definition of the effect coded variables)
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

# ANOVA tables
anova(model1, ssType=3)
anova(model2, ssType=3)		# For balanaced designs, SScountry(type) = SScountryXType + SScountry.
# But, this is not a balanced design

options(contrasts=c("contr.sum", "contr.poly"))		# effect coding for factors
attach(cu.summary)
cu.summary\$Small.Type<-(Type=="Small" | Type=="Sporty" | Type=="Compact")
model1<-lm(log(Price) ~ (Country=="USA")*Small.Type, data=cu.summary)
model2<-lm(log(Price) ~ Small.Type/(Country=="USA"), data=cu.summary)	# Country within Type

model1
model2

# What is the coding used? (Definition of the effect coded variables)
model1\$contrasts
model2\$contrasts

model.matrix(model1)
model.matrix(model2)

## Interaction between two numeric variables

lm(Fuel ~ Weight*Mileage, data=fuel.frame)
lm(Fuel ~ (Weight+Mileage)^2, data=fuel.frame)	# equivalent

lm(Fuel ~ I(Weight/Mileage), data=fuel.frame)
lm(Fuel ~ I(Weight*Mileage), data=fuel.frame)
lm(Fuel ~ I(Weight^2)+I(Mileage^2), data=fuel.frame)
lm(Fuel ~ (Weight^2) + (Mileage^2), data=fuel.frame)	# equivalent

# polynomial regression (with orthogonal polynomials)

attach(fuel.frame)
model1<-lm(log(Tank)~ C(Tires,poly,3) + Disp2 + HP.revs, data=cu.specs)	# fit up to a cubic component in Tires
model.matrix(model1)

summary(model1, cor=F)

## Updating linear models

# Remove cubic component
model2<-update(model1, . ~ . -C(Tires,poly,3) + C(Tires,poly,2))
summary(model2, cor=F)

## Fitting all predictors in a data frame

lm(Fuel~., data=fuel.frame)

## User-defined contrasts

attach(quine)

# Here we want to contrast first two Age groups with last two
# We also get two additional independent contrasts, which we can ignore
model1<-lm(log(Days+1)~Eth + C(Age,c(1,1,-1,-1)), data=quine)

# to see what the additional contrasts are:
model1\$contrasts

## Model Frames and model matrices

attach(quine)
mmatrix<-model.matrix(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn)
mframe<-model.frame(model.matrix(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn))

attr(mmatrix, "assign")

# getting the model frame from an lm fit
lm(log(Days + 1) ~ Eth + C(Age,c(1,1,-1,-1)) + Lrn, data=quine, method="model.frame")

## Adding and dropping one term at a time and testing the resulting model