#!/usr/local/bin/R # Xavier Fernández i MarÃn # August 2006 # http://xavier-fim.net # Dealing with variables # This script is very focused on treating typical variables from surveys # used in the Political and Social Sciences. # Simulate data # Simulate independent variables obs <- 1000 age <- round(runif(obs, min=18, max=60), digits=0) # uniform distribution income <- rgamma(obs, scale=20, shape=2) # distribution skewed, with a big tail in the right sex <- gl(2, 1, length=obs, labels=c("male", "female")) # factor with only two possible outcomes group <- rep(gl(5, 1, labels=c("A", "B", "C", "D", "E")), obs/5) # factor with 5 possible outcomes # Simulate some kind of relationship with another variable y <- 5*(rep(1, obs)) + 1.5*age - 0.05*age^2 + 8.5*as.numeric(sex) + 0.5*income + 2.1*income*as.numeric(sex) + 5.3*as.numeric(group)[c(2,5,4,3,1)] + rnorm(obs, mean=100, sd=15) # We have simply assign some random numbers to the variables, with: # a nonlinear relationship between age and y # an interaction term between income and sex # a constant difference between 'group' of 5.3, but with no relationship # with the order of the groups # a constant with value 5 (we have added a vector of 1's) ### Inspect variables with an apropiate instrument hist(age) hist(income) table(sex) table(group, exclude=NULL) # to get also missing values # which are hidden by default ### Recode # Say that, instead of 5 groups we want to recode them into 3 groups group.2 <- rep(NA, length(group)) group.2[group=="A" | group=="B"] <- "A-B" group.2[group=="C" | group=="D" | group=="E"] <- "C-D-E" group.2 <- as.factor(group.2) # Be sure that we end up with a factor # If we include "group" into any regression it will take 'A' as the base # category. # We can change it using relevel() group <- relevel(group, ref="D") # We want to group the income variable in ranges income.discrete <- cut(income, breaks=c(0, 10, 30, 50, max(income))) # assigning ourselves the cutting points income.discrete.2 <- cut(income, breaks=5) # giving the number of intervals ### Inspect bivariate relationships between the y and all other variables plot(age, y) plot(income, y) # simple scatterplots plot(sex, y) plot(group, y) # yes, R knows what to do ... # (when you assign a proper class to you data!) plot(jitter(as.numeric(sex)), y) plot(jitter(as.numeric(group)), y) # or we can add some jitter to our data plot(data.frame(y, age, income, sex=jitter(as.numeric(sex)), group=jitter(as.numeric(group)))) # All in one single plot ### Include variables into a regression # If we want to include 'age' and its square in a regression we don't need # anymore to create an specific variable. We can use: I(age)^2 lm(y ~ age + I(age^2)) # When interactions are required: # age*sex will include both terms and its interaction # age:sex will include only the interaction term lm(y ~ age * sex) # If we want to include dummies for, say, every year in 'age', we can simply # use 'as.factor(age)' lm(y ~ as.factor(age)) # Finally, run the regression (print the complete results and, # at the same time, save the regression as an object) summary(model <- lm(y ~ age + I(age^2) + income * sex + group))