1 #!/usr/local/bin/R 2 3 # Xavier Fernández i Marín 4 # August 2006 5 # http://xavier-fim.net 6 7 # Dealing with variables 8 # This script is very focused on treating typical variables from surveys 9 # used in the Political and Social Sciences. 10 11 12 # Simulate data 13 # Simulate independent variables 14 obs <- 1000 15 age <- round(runif(obs, min=18, max=60), digits=0) 16 # uniform distribution 17 18 income <- rgamma(obs, scale=20, shape=2) 19 # distribution skewed, with a big tail in the right 20 21 sex <- gl(2, 1, length=obs, labels=c("male", "female")) 22 # factor with only two possible outcomes 23 24 group <- rep(gl(5, 1, labels=c("A", "B", "C", "D", "E")), obs/5) 25 # factor with 5 possible outcomes 26 27 28 # Simulate some kind of relationship with another variable 29 y <- 5*(rep(1, obs)) + 30 1.5*age - 0.05*age^2 + 31 8.5*as.numeric(sex) + 32 0.5*income + 33 2.1*income*as.numeric(sex) + 34 5.3*as.numeric(group)[c(2,5,4,3,1)] + 35 rnorm(obs, mean=100, sd=15) 36 37 # We have simply assign some random numbers to the variables, with: 38 # a nonlinear relationship between age and y 39 # an interaction term between income and sex 40 # a constant difference between 'group' of 5.3, but with no relationship 41 # with the order of the groups 42 # a constant with value 5 (we have added a vector of 1's) 43 44 45 ### Inspect variables with an apropiate instrument 46 hist(age) 47 hist(income) 48 table(sex) 49 table(group, exclude=NULL) # to get also missing values 50 # which are hidden by default 51 52 53 54 ### Recode 55 56 # Say that, instead of 5 groups we want to recode them into 3 groups 57 group.2 <- rep(NA, length(group)) 58 group.2[group=="A" | group=="B"] <- "A-B" 59 group.2[group=="C" | group=="D" | group=="E"] <- "C-D-E" 60 group.2 <- as.factor(group.2) # Be sure that we end up with a factor 61 62 63 # If we include "group" into any regression it will take 'A' as the base 64 # category. 65 # We can change it using relevel() 66 group <- relevel(group, ref="D") 67 68 69 # We want to group the income variable in ranges 70 income.discrete <- cut(income, breaks=c(0, 10, 30, 50, max(income))) 71 # assigning ourselves the cutting points 72 73 income.discrete.2 <- cut(income, breaks=5) 74 # giving the number of intervals 75 76 77 ### Inspect bivariate relationships between the y and all other variables 78 plot(age, y) 79 plot(income, y) # simple scatterplots 80 81 plot(sex, y) 82 plot(group, y) # yes, R knows what to do ... 83 # (when you assign a proper class to you data!) 84 85 plot(jitter(as.numeric(sex)), y) 86 plot(jitter(as.numeric(group)), y) 87 # or we can add some jitter to our data 88 89 90 plot(data.frame(y, age, income, sex=jitter(as.numeric(sex)), 91 group=jitter(as.numeric(group)))) 92 # All in one single plot 93 94 95 96 97 98 ### Include variables into a regression 99 100 # If we want to include 'age' and its square in a regression we don't need 101 # anymore to create an specific variable. We can use: I(age)^2 102 lm(y ~ age + I(age^2)) 103 104 105 # When interactions are required: 106 # age*sex will include both terms and its interaction 107 # age:sex will include only the interaction term 108 lm(y ~ age * sex) 109 110 111 # If we want to include dummies for, say, every year in 'age', we can simply 112 # use 'as.factor(age)' 113 lm(y ~ as.factor(age)) 114 115 116 117 # Finally, run the regression (print the complete results and, 118 # at the same time, save the regression as an object) 119 summary(model <- lm(y ~ age + I(age^2) + income * sex + group))