-
Notifications
You must be signed in to change notification settings - Fork 1
/
9-heckman.Rmd
143 lines (106 loc) · 3.05 KB
/
9-heckman.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
---
title: "R Notebook"
output: html_notebook
---
```{r}
install.packages(c("sampleSelection", "GJRM"))
library(data.table)
library(tidyverse)
library(haven)
library(sampleSelection)
library(GJRM)
library(maxLik)
```
### women wages
```{r}
women <- read_stata("../data/womenwk.dta")
women$selection <- !is.na(women$wage)
head(women)
```
```{r}
m0 <- glm(wage ~ education + age, data = women, subset = selection)
summary(m0)
```
sampleSelection package
```{r}
m1 <- heckit(selection ~ married + children + education + age,
wage ~ education + age,
data = women)
summary(m1)
```
```{r}
m2 <- heckit(selection ~ married + children + education + age,
wage ~ education + age,
data = women,
method = "ml")
summary(m2)
```
```{r}
m3 <- gjrm(
formula = list(selection ~ married + children+ education +age,
wage ~ education + age),
data = women,
Model = "BSS",
margin = c("probit", "N")
)
summary(m3)
```
We can also account for the correlation within county using `lmtest` and `sandwich` estimator.
```{r}
coeftest(m2, vcov = vcovCL(m2, cluster = women$county))
```
## Heckman' -- using ML
Source: https://www.stata.com/manuals13/rheckman.pdf
```{r}
ll_heckman <- function(params, xs, xo, sel, y) {
ncol_xs <- ncol(xs)
ncol_xo <- ncol(xo)
beta_s <- params[1:ncol_xs]
beta_o <- params[(ncol_xs+1):(ncol_xs+ncol_xo)]
rho <- params[(ncol_xs+ncol_xo+1)]
sigma <- params[ncol_xs+ncol_xo+2]
ll_s <- log(pnorm( -xs[!sel,] %*% beta_s))
ll_o <- log( pnorm( (xs[sel,] %*% beta_s + rho/sigma*(y - xo %*% beta_o)) / (sqrt(1-rho^2)) ) ) -
1/2*( (y - xo %*% beta_o)/sigma )^2 - log(sqrt(2*pi)*sigma)
l <- sum(ll_s) + sum(ll_o)
l
}
```
```{r}
women$wage <- ifelse(is.na(women$wage), 0, women$wage)
m1 <- glm(selection ~ married + children + education +age,
data = women,
family = binomial("probit"))
m2 <- glm(wage ~education + age,
data = women,
subset = selection)
params <- c(coef(m1), coef(m2), rho = 0.5, sigma = sd(m2$y))
xs <- model.matrix(m1)
xo <- model.matrix(m2)
sel <- women$wage > 0
y <- m2$y
ll_heckman(params, xs, xo, sel, y)
```
```{r}
ml <- maxLik(logLik = ll_heckman, start = params, xs = xs, xo = xo, sel = sel, y= y, method = "NR")
summary(ml)
```
Comparison with SampleSelection
```
Probit selection equation:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.491015 0.189340 -13.156 < 2e-16 ***
married 0.445171 0.067395 6.605 5.07e-11 ***
children 0.438707 0.027783 15.791 < 2e-16 ***
education 0.055732 0.010735 5.192 2.30e-07 ***
age 0.036510 0.004153 8.790 < 2e-16 ***
Outcome equation:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.48579 1.07704 0.451 0.652
education 0.98995 0.05326 18.588 <2e-16 ***
age 0.21313 0.02060 10.345 <2e-16 ***
Error terms:
Estimate Std. Error t value Pr(>|t|)
sigma 6.00479 0.16572 36.23 <2e-16 ***
rho 0.70350 0.05123 13.73 <2e-16 ***
```