#!/usr/local/bin/R

# Xavier Fernández i Marín
# August 2006
# http://xavier-fim.net

# Dealing with variables
# This script is very focused on treating typical variables from surveys
# used in the Political and Social Sciences.


# Simulate data
# Simulate independent variables 
obs <- 1000
age <-  round(runif(obs, min=18, max=60), digits=0)
           # uniform distribution

income <- rgamma(obs, scale=20, shape=2)
           # distribution skewed, with a big tail in the right

sex <- gl(2, 1, length=obs, labels=c("male", "female"))
           # factor with only two possible outcomes

group <- rep(gl(5, 1, labels=c("A", "B", "C", "D", "E")), obs/5)
           # factor with 5 possible outcomes


# Simulate some kind of relationship with another variable
y <- 5*(rep(1, obs)) +
     1.5*age - 0.05*age^2 +
     8.5*as.numeric(sex) +
     0.5*income +
     2.1*income*as.numeric(sex) +
     5.3*as.numeric(group)[c(2,5,4,3,1)] +
     rnorm(obs, mean=100, sd=15)

# We have simply assign some random numbers to the variables, with:
#  a nonlinear relationship between age and y
#  an interaction term between income and sex
#  a constant difference between 'group' of 5.3, but with no relationship
#   with the order of the groups
#  a constant with value 5 (we have added a vector of 1's)


### Inspect variables with an apropiate instrument
hist(age)
hist(income)
table(sex)
table(group, exclude=NULL)    # to get also missing values
                              # which are hidden by default



### Recode

# Say that, instead of 5 groups we want to recode them into 3 groups
group.2 <- rep(NA, length(group))
group.2[group=="A" | group=="B"] <- "A-B"
group.2[group=="C" | group=="D" | group=="E"] <- "C-D-E"
group.2 <- as.factor(group.2)    # Be sure that we end up with a factor


# If we include "group" into any regression it will take 'A' as the base
# category.
# We can change it using relevel()
group <- relevel(group, ref="D")


# We want to group the income variable in ranges
income.discrete <- cut(income, breaks=c(0, 10, 30, 50, max(income)))
          # assigning ourselves the cutting points

income.discrete.2 <- cut(income, breaks=5)
          # giving the number of intervals


### Inspect bivariate relationships between the y and all other variables
plot(age, y)
plot(income, y)   # simple scatterplots

plot(sex, y)
plot(group, y)    # yes, R knows what to do ...
                  #  (when you assign a proper class to you data!)

plot(jitter(as.numeric(sex)), y)
plot(jitter(as.numeric(group)), y)
                  # or we can add some jitter to our data


plot(data.frame(y, age, income, sex=jitter(as.numeric(sex)),
                group=jitter(as.numeric(group))))
                  # All in one single plot





### Include variables into a regression

# If we want to include 'age' and its square in a regression we don't need
# anymore to create an specific variable. We can use: I(age)^2
lm(y ~ age + I(age^2))


# When interactions are required:
#  age*sex   will include both terms and its interaction
#  age:sex   will include only the interaction term
lm(y ~ age * sex)


# If we want to include dummies for, say, every year in 'age', we can simply
# use 'as.factor(age)'
lm(y ~ as.factor(age))



# Finally, run the regression (print the complete results and,
# at the same time, save the regression as an object)
summary(model <- lm(y ~ age + I(age^2) + income * sex + group))