#!/usr/local/bin/R
# Xavier Fernández i Marín
# August 2006
# http://xavier-fim.net
# Dealing with variables
# This script is very focused on treating typical variables from surveys
# used in the Political and Social Sciences.
# Simulate data
# Simulate independent variables
obs <- 1000
age <- round(runif(obs, min=18, max=60), digits=0)
# uniform distribution
income <- rgamma(obs, scale=20, shape=2)
# distribution skewed, with a big tail in the right
sex <- gl(2, 1, length=obs, labels=c("male", "female"))
# factor with only two possible outcomes
group <- rep(gl(5, 1, labels=c("A", "B", "C", "D", "E")), obs/5)
# factor with 5 possible outcomes
# Simulate some kind of relationship with another variable
y <- 5*(rep(1, obs)) +
1.5*age - 0.05*age^2 +
8.5*as.numeric(sex) +
0.5*income +
2.1*income*as.numeric(sex) +
5.3*as.numeric(group)[c(2,5,4,3,1)] +
rnorm(obs, mean=100, sd=15)
# We have simply assign some random numbers to the variables, with:
# a nonlinear relationship between age and y
# an interaction term between income and sex
# a constant difference between 'group' of 5.3, but with no relationship
# with the order of the groups
# a constant with value 5 (we have added a vector of 1's)
### Inspect variables with an apropiate instrument
hist(age)
hist(income)
table(sex)
table(group, exclude=NULL) # to get also missing values
# which are hidden by default
### Recode
# Say that, instead of 5 groups we want to recode them into 3 groups
group.2 <- rep(NA, length(group))
group.2[group=="A" | group=="B"] <- "A-B"
group.2[group=="C" | group=="D" | group=="E"] <- "C-D-E"
group.2 <- as.factor(group.2) # Be sure that we end up with a factor
# If we include "group" into any regression it will take 'A' as the base
# category.
# We can change it using relevel()
group <- relevel(group, ref="D")
# We want to group the income variable in ranges
income.discrete <- cut(income, breaks=c(0, 10, 30, 50, max(income)))
# assigning ourselves the cutting points
income.discrete.2 <- cut(income, breaks=5)
# giving the number of intervals
### Inspect bivariate relationships between the y and all other variables
plot(age, y)
plot(income, y) # simple scatterplots
plot(sex, y)
plot(group, y) # yes, R knows what to do ...
# (when you assign a proper class to you data!)
plot(jitter(as.numeric(sex)), y)
plot(jitter(as.numeric(group)), y)
# or we can add some jitter to our data
plot(data.frame(y, age, income, sex=jitter(as.numeric(sex)),
group=jitter(as.numeric(group))))
# All in one single plot
### Include variables into a regression
# If we want to include 'age' and its square in a regression we don't need
# anymore to create an specific variable. We can use: I(age)^2
lm(y ~ age + I(age^2))
# When interactions are required:
# age*sex will include both terms and its interaction
# age:sex will include only the interaction term
lm(y ~ age * sex)
# If we want to include dummies for, say, every year in 'age', we can simply
# use 'as.factor(age)'
lm(y ~ as.factor(age))
# Finally, run the regression (print the complete results and,
# at the same time, save the regression as an object)
summary(model <- lm(y ~ age + I(age^2) + income * sex + group))