1 #!/usr/local/bin/R
  2 
  3 # Xavier Fernández i Marín
  4 # August 2006
  5 # http://xavier-fim.net
  6 
  7 # Dealing with variables
  8 # This script is very focused on treating typical variables from surveys
  9 # used in the Political and Social Sciences.
 10 
 11 
 12 # Simulate data
 13 # Simulate independent variables 
 14 obs <- 1000
 15 age <-  round(runif(obs, min=18, max=60), digits=0)
 16            # uniform distribution
 17 
 18 income <- rgamma(obs, scale=20, shape=2)
 19            # distribution skewed, with a big tail in the right
 20 
 21 sex <- gl(2, 1, length=obs, labels=c("male", "female"))
 22            # factor with only two possible outcomes
 23 
 24 group <- rep(gl(5, 1, labels=c("A", "B", "C", "D", "E")), obs/5)
 25            # factor with 5 possible outcomes
 26 
 27 
 28 # Simulate some kind of relationship with another variable
 29 y <- 5*(rep(1, obs)) +
 30      1.5*age - 0.05*age^2 +
 31      8.5*as.numeric(sex) +
 32      0.5*income +
 33      2.1*income*as.numeric(sex) +
 34      5.3*as.numeric(group)[c(2,5,4,3,1)] +
 35      rnorm(obs, mean=100, sd=15)
 36 
 37 # We have simply assign some random numbers to the variables, with:
 38 #  a nonlinear relationship between age and y
 39 #  an interaction term between income and sex
 40 #  a constant difference between 'group' of 5.3, but with no relationship
 41 #   with the order of the groups
 42 #  a constant with value 5 (we have added a vector of 1's)
 43 
 44 
 45 ### Inspect variables with an apropiate instrument
 46 hist(age)
 47 hist(income)
 48 table(sex)
 49 table(group, exclude=NULL)    # to get also missing values
 50                               # which are hidden by default
 51 
 52 
 53 
 54 ### Recode
 55 
 56 # Say that, instead of 5 groups we want to recode them into 3 groups
 57 group.2 <- rep(NA, length(group))
 58 group.2[group=="A" | group=="B"] <- "A-B"
 59 group.2[group=="C" | group=="D" | group=="E"] <- "C-D-E"
 60 group.2 <- as.factor(group.2)    # Be sure that we end up with a factor
 61 
 62 
 63 # If we include "group" into any regression it will take 'A' as the base
 64 # category.
 65 # We can change it using relevel()
 66 group <- relevel(group, ref="D")
 67 
 68 
 69 # We want to group the income variable in ranges
 70 income.discrete <- cut(income, breaks=c(0, 10, 30, 50, max(income)))
 71           # assigning ourselves the cutting points
 72 
 73 income.discrete.2 <- cut(income, breaks=5)
 74           # giving the number of intervals
 75 
 76 
 77 ### Inspect bivariate relationships between the y and all other variables
 78 plot(age, y)
 79 plot(income, y)   # simple scatterplots
 80 
 81 plot(sex, y)
 82 plot(group, y)    # yes, R knows what to do ...
 83                   #  (when you assign a proper class to you data!)
 84 
 85 plot(jitter(as.numeric(sex)), y)
 86 plot(jitter(as.numeric(group)), y)
 87                   # or we can add some jitter to our data
 88 
 89 
 90 plot(data.frame(y, age, income, sex=jitter(as.numeric(sex)),
 91                 group=jitter(as.numeric(group))))
 92                   # All in one single plot
 93 
 94 
 95 
 96 
 97 
 98 ### Include variables into a regression
 99 
100 # If we want to include 'age' and its square in a regression we don't need
101 # anymore to create an specific variable. We can use: I(age)^2
102 lm(y ~ age + I(age^2))
103 
104 
105 # When interactions are required:
106 #  age*sex   will include both terms and its interaction
107 #  age:sex   will include only the interaction term
108 lm(y ~ age * sex)
109 
110 
111 # If we want to include dummies for, say, every year in 'age', we can simply
112 # use 'as.factor(age)'
113 lm(y ~ as.factor(age))
114 
115 
116 
117 # Finally, run the regression (print the complete results and,
118 # at the same time, save the regression as an object)
119 summary(model <- lm(y ~ age + I(age^2) + income * sex + group))