Chapter 4. Single-Equation and OLS Estimation#
Examples#
Example 4.1. Wage equation for married working women#
# Example 4.1 (Python): log-wage equation for married working women (MROZ).
# OLS with conventional SEs vs. heteroskedasticity-robust (HC1) SEs.
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col
from wooldridge import *

df = dataWoo("mroz")

wage_formula = 'lwage ~ exper + expersq + educ + age + kidslt6 + kidsge6'
lwage_hetr = smf.ols(wage_formula, data=df).fit()
lwage_robust = smf.ols(wage_formula, data=df).fit(cov_type='HC1')

# Side-by-side table with N, R2 and adjusted R2 reported for each column.
print(summary_col(
    [lwage_hetr, lwage_robust],
    stars=True,
    float_format='%0.3f',
    info_dict={
        'N': lambda m: "{0:d}".format(int(m.nobs)),
        'R2': lambda m: "{:.3f}".format(m.rsquared),
        'Adj.R2': lambda m: "{:.3f}".format(m.rsquared_adj),
    },
))
# Example 4.1 (R): same wage equation; HC1-robust SEs via sandwich/AER.
library(wooldridge)
library(AER)
library(lmtest)
library(stargazer)

wage_eq <- lwage ~ exper + expersq + educ + age + kidslt6 + kidsge6
lwage_hetr <- lm(wage_eq, data = mroz)
lwage_robust <- coeftest(lwage_hetr, vcovHC(lwage_hetr, type = "HC1"))

# Text table with both sets of standard errors side by side.
stargazer(lwage_hetr, lwage_robust,
          column.labels = c("Hetrosced.", "Robust SE"),
          no.space = TRUE, type = "text")
* Example 4.1 (Stata): wage equation with conventional and robust SEs.
bcuse mroz, clear nodesc
eststo ols: reg lwage exper expersq educ age kidslt6 kidsge6
eststo ols_r: reg lwage exper expersq educ age kidslt6 kidsge6, r
* Fixed: the stats() option was missing its closing parenthesis.
estout ols ols_r, cells(b(star fmt(4)) se(par fmt(4))) stats(N, fmt(%9.0g))
================================
lwage I lwage II
--------------------------------
Intercept -0.421 -0.421
(0.317) (0.318)
exper 0.040*** 0.040***
(0.013) (0.015)
expersq -0.001* -0.001*
(0.000) (0.000)
educ 0.108*** 0.108***
(0.014) (0.014)
age -0.001 -0.001
(0.005) (0.006)
kidslt6 -0.061 -0.061
(0.089) (0.106)
kidsge6 -0.015 -0.015
(0.028) (0.029)
R-squared 0.158 0.158
R-squared Adj. 0.146 0.146
N 428 428
R2 0.158 0.158
Adj.R2 0.146 0.146
================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01
# Joint F test (Python) that age, kidslt6 and kidsge6 are jointly zero.
print(lwage_hetr.f_test('(kidslt6=kidsge6=age=0)'))
# Joint F test (R) of the same three exclusion restrictions.
linearHypothesis(lwage_hetr, c("kidslt6=0", "kidsge6=0", "age=0"))
// Joint F test (Stata) of the same three exclusion restrictions.
test kidsge6 kidslt6 age
<F test: F=0.23702385952852514, p=0.8705394717167727, df_denom=421, df_num=3>
# LM test, step 1 (Python): residuals from the restricted model, then an
# auxiliary regression of those residuals on the full regressor set.
restricted = smf.ols('lwage ~ exper + expersq + educ', data=df).fit()
uhat_rst = restricted.resid
# uhat_rst is resolved from the calling environment by the formula parser.
uhat_reg = smf.ols('uhat_rst ~ exper + expersq + educ + age + kidslt6 + kidsge6',
                   data=df.dropna()).fit()
print(uhat_reg.summary())
# LM test, step 1 (R): restricted residuals regressed on all regressors.
restricted_fit <- lm(lwage ~ exper + expersq + educ, data = mroz)
uhat_rst <- resid(restricted_fit)
uhat_reg <- lm(uhat_rst ~ exper + expersq + educ + age + kidslt6 + kidsge6,
               data = subset(mroz, !is.na(wage)))
summary(uhat_reg)
// LM test, step 1 (Stata): restricted model, save residuals, auxiliary regression.
reg lwage exper expersq educ
predict u_r, residual
reg u_r exper expersq educ age kidslt6 kidsge6
OLS Regression Results
==============================================================================
Dep. Variable: uhat_rst R-squared: 0.002
Model: OLS Adj. R-squared: -0.013
Method: Least Squares F-statistic: 0.1185
Date: Mon, 11 Dec 2023 Prob (F-statistic): 0.994
Time: 22:32:29 Log-Likelihood: -431.24
No. Observations: 428 AIC: 876.5
Df Residuals: 421 BIC: 904.9
Df Model: 6
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.1011 0.317 0.319 0.750 -0.522 0.724
exper -0.0017 0.013 -0.130 0.896 -0.028 0.025
expersq 2.996e-05 0.000 0.074 0.941 -0.001 0.001
educ 0.0003 0.014 0.024 0.981 -0.028 0.029
age -0.0015 0.005 -0.277 0.782 -0.012 0.009
kidslt6 -0.0607 0.089 -0.684 0.494 -0.235 0.114
kidsge6 -0.0146 0.028 -0.523 0.601 -0.069 0.040
==============================================================================
Omnibus: 76.463 Durbin-Watson: 1.963
Prob(Omnibus): 0.000 Jarque-Bera (JB): 301.786
Skew: -0.732 Prob(JB): 2.94e-66
Kurtosis: 6.844 Cond. No. 3.54e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.54e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
# LM statistic (Python): N * R-squared from the auxiliary regression.
LM1 = uhat_reg.nobs * uhat_reg.rsquared
LM1
# LM statistic (R): N * R-squared from the auxiliary regression.
LM <- (summary(uhat_reg)$r.squared) * (nobs(uhat_reg))
LM
// LM statistic (Stata): N * R-squared from the auxiliary regression.
display "LM =" e(r2)*e(N)
0.7216757067709283
import scipy.stats as ss
# p-value (Python): upper tail of chi-square with 3 df (three restrictions).
ss.chi2.sf(LM1, 3)
# p-value (R): upper tail of chi-square with 3 df.
pchisq(LM, df=3, lower.tail = FALSE)
// p-value (Stata): upper tail of chi-square with 3 df.
di chi2tail(3, e(r2)*e(N))
0.8680941471323707
Example 4.1. … LM2 Continued on p. 65#
# LM test, alternative form (Python): for each excluded regressor, partial out
# the included regressors, multiply the resulting residual by the restricted
# residual, then regress a vector of ones on the products with no intercept.
uhat_rst = smf.ols('lwage ~ exper + expersq + educ', data=df).fit().resid
for excluded in ['age', 'kidslt6', 'kidsge6']:
    partial = smf.ols(excluded + '~educ+exper+expersq', data=df.dropna()).fit().resid
    df['u_' + excluded] = partial * uhat_rst
df['one'] = 1  # vector of ones used as the dependent variable
one_r = smf.ols('one ~ u_age + u_kidslt6 + u_kidsge6 + 0', data=df.dropna()).fit()
print(one_r.summary())
# LM test, alternative form (R): products of partialled-out excluded regressors
# with the restricted residual; regress 1 on them without an intercept.
uhat_rst <- resid(lm(lwage ~ exper + expersq + educ, data = mroz))  # restricted residuals
partial_times_u <- function(dv) {
  # Residual of dv on the included regressors, times the restricted residual.
  fit <- lm(reformulate(c("educ", "exper", "expersq"), dv),
            data = subset(mroz, !is.na(wage)))
  resid(fit) * uhat_rst
}
u_age <- partial_times_u("age")
u_kidslt6 <- partial_times_u("kidslt6")
u_kidsge6 <- partial_times_u("kidsge6")
udata <- data.frame(cbind(u_age, u_kidslt6, u_kidsge6))
udata['one'] <- 1  # vector of ones
one_r <- lm(one ~ u_age + u_kidslt6 + u_kidsge6 + 0, data = udata)
summary(one_r)
// LM test, alternative form (Stata): partial out the included regressors from
// each excluded one, multiply by the restricted residual u_r, then regress a
// vector of ones on the products with no constant.
// NOTE(review): "kidslt", "kidsg" and "edu" rely on Stata variable-name
// abbreviation (kidslt6, kidsge6, educ) -- confirm they expand uniquely.
foreach x of var age kidslt kidsg{
reg `x' exper* edu
predict r_`x', residual
gen ures`x'= u_r*r_`x'
}
gen one=1
reg one ures*, noc
OLS Regression Results
=======================================================================================
Dep. Variable: one R-squared (uncentered): 0.001
Model: OLS Adj. R-squared (uncentered): -0.006
Method: Least Squares F-statistic: 0.1696
Date: Mon, 11 Dec 2023 Prob (F-statistic): 0.917
Time: 22:32:29 Log-Likelihood: -607.05
No. Observations: 428 AIC: 1220.
Df Residuals: 425 BIC: 1232.
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
u_age -0.0025 0.011 -0.234 0.815 -0.024 0.019
u_kidslt6 -0.0905 0.169 -0.535 0.593 -0.423 0.242
u_kidsge6 -0.0267 0.060 -0.443 0.658 -0.145 0.092
==============================================================================
Omnibus: 177.849 Durbin-Watson: 0.002
Prob(Omnibus): 0.000 Jarque-Bera (JB): 7632.953
Skew: 1.016 Prob(JB): 0.00
Kurtosis: 23.588 Cond. No. 17.3
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# LM2 (Python): N minus SSR from the no-intercept regression of 1 on the products.
LM2 = one_r.nobs - one_r.ssr
LM2
# LM2 (R): N minus SSR from the no-intercept regression of 1 on the products.
LM2 <- (nobs(one_r))-sum(resid(one_r)^2)
LM2
// LM2 (Stata): N minus SSR from the no-constant regression.
display "LM2 = N-SSRo =" e(N)-e(rss)
0.511880062357136
# p-value for LM2 (Python): chi-square(3) upper tail.
ss.chi2.sf(LM2, 3)
# p-value for LM2 (R): chi-square(3) upper tail.
pchisq(LM2, df=3, lower.tail = FALSE)
// p-value for LM2 (Stata): chi-square(3) upper tail.
di chi2tail(3, e(N)-e(rss))
0.9162737684759361
Example 4.3 Using IQ as a Proxy for Ability#
# Example 4.3 (Python): baseline NLS80 wage regression without an ability proxy.
df=pd.read_csv("nls80.csv")
print(smf.ols('lwage ~ exper + tenure + married + south + urban + black + educ', data=df).fit().summary())
# Example 4.3 (R): baseline NLS80 wage regression without an ability proxy.
# Fixed: read_dta() is from the haven package, which was never loaded.
library(haven)
nls80 <- read_dta("nls80.dta")
summary(lm(lwage ~ exper + tenure + married + south + urban + black + educ, data = nls80))
// Example 4.3 (Stata): baseline NLS80 wage regression without an ability proxy.
bcuse nls80, clear nodesc
reg lwage exper tenure married south urban black educ
OLS Regression Results
==============================================================================
Dep. Variable: lwage R-squared: 0.253
Model: OLS Adj. R-squared: 0.247
Method: Least Squares F-statistic: 44.75
Date: Mon, 11 Dec 2023 Prob (F-statistic): 1.16e-54
Time: 22:32:29 Log-Likelihood: -381.55
No. Observations: 935 AIC: 779.1
Df Residuals: 927 BIC: 817.8
Df Model: 7
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.3955 0.113 47.653 0.000 5.173 5.618
exper 0.0140 0.003 4.409 0.000 0.008 0.020
tenure 0.0117 0.002 4.789 0.000 0.007 0.017
married 0.1994 0.039 5.107 0.000 0.123 0.276
south -0.0909 0.026 -3.463 0.001 -0.142 -0.039
urban 0.1839 0.027 6.822 0.000 0.131 0.237
black -0.1883 0.038 -5.000 0.000 -0.262 -0.114
educ 0.0654 0.006 10.468 0.000 0.053 0.078
==============================================================================
Omnibus: 38.227 Durbin-Watson: 1.823
Prob(Omnibus): 0.000 Jarque-Bera (JB): 83.390
Skew: -0.224 Prob(JB): 7.80e-19
Kurtosis: 4.393 Cond. No. 187.
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Example 4.3 continued (Python): add IQ as a proxy for unobserved ability.
# Fixed: stray Markdown backticks fused onto the end of the statement
# broke the Python syntax.
print(smf.ols('lwage ~ exper + tenure + married + south + urban + black + educ + iq',
              data=df).fit().summary())
# Example 4.3 continued (R): add IQ as a proxy for unobserved ability.
summary(lm(lwage ~ exper + tenure + married + south + urban + black + educ + iq, data=nls80))
// Example 4.3 continued (Stata): add IQ as a proxy for unobserved ability.
reg lwage exper tenure married south urban black educ iq
OLS Regression Results
==============================================================================
Dep. Variable: lwage R-squared: 0.263
Model: OLS Adj. R-squared: 0.256
Method: Least Squares F-statistic: 41.27
Date: Mon, 11 Dec 2023 Prob (F-statistic): 1.52e-56
Time: 22:32:29 Log-Likelihood: -375.09
No. Observations: 935 AIC: 768.2
Df Residuals: 926 BIC: 811.7
Df Model: 8
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.1764 0.128 40.441 0.000 4.925 5.428
exper 0.0141 0.003 4.469 0.000 0.008 0.020
tenure 0.0114 0.002 4.671 0.000 0.007 0.016
married 0.1998 0.039 5.148 0.000 0.124 0.276
south -0.0802 0.026 -3.054 0.002 -0.132 -0.029
urban 0.1819 0.027 6.791 0.000 0.129 0.235
black -0.1431 0.039 -3.624 0.000 -0.221 -0.066
educ 0.0544 0.007 7.853 0.000 0.041 0.068
iq 0.0036 0.001 3.589 0.000 0.002 0.006
==============================================================================
Omnibus: 43.456 Durbin-Watson: 1.820
Prob(Omnibus): 0.000 Jarque-Bera (JB): 99.739
Skew: -0.248 Prob(JB): 2.20e-22
Kurtosis: 4.521 Cond. No. 1.13e+03
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.13e+03. This might indicate that there are
strong multicollinearity or other numerical problems.
Example 4.4 Effects of Job Training Grants on Worker Productivity#
# Example 4.4 (Python): effect of job-training grants on the log scrap rate,
# using the 1988 cross-section of the JTRAIN panel.
df = dataWoo("jtrain")
df = df.loc[df['year'] == 1988]
print(smf.ols('lscrap ~ grant', data=df).fit().summary())
# Example 4.4 (R): grant effect on log scrap rate, 1988 cross-section.
summary(lm(lscrap ~ grant, data=subset(jtrain, jtrain$year==1988)))
// Example 4.4 (Stata): grant effect on log scrap rate, 1988 cross-section.
// NOTE(review): Python/R load dataset "jtrain"; confirm jtrain1.dta is the same file.
use jtrain1, clear
regress lscrap grant if year==1988
OLS Regression Results
==============================================================================
Dep. Variable: lscrap R-squared: 0.000
Model: OLS Adj. R-squared: -0.019
Method: Least Squares F-statistic: 0.01948
Date: Mon, 11 Dec 2023 Prob (F-statistic): 0.890
Time: 22:32:29 Log-Likelihood: -94.660
No. Observations: 54 AIC: 193.3
Df Residuals: 52 BIC: 197.3
Df Model: 1
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.4085 0.241 1.698 0.095 -0.074 0.891
grant 0.0566 0.406 0.140 0.890 -0.757 0.870
==============================================================================
Omnibus: 0.074 Durbin-Watson: 1.931
Prob(Omnibus): 0.963 Jarque-Bera (JB): 0.086
Skew: -0.067 Prob(JB): 0.958
Kurtosis: 2.859 Cond. No. 2.42
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Example 4.4 continued (Python): control for the lagged log scrap rate (lscrap_1).
print(smf.ols('lscrap ~ grant+lscrap_1', data=df).fit().summary())
# Example 4.4 continued (R): control for the lagged log scrap rate.
summary(lm(lscrap ~ grant + lscrap_1, data=subset(jtrain, jtrain$year==1988)))
// Example 4.4 continued (Stata): control for the lagged log scrap rate.
regress lscrap grant lscrap_1 if year==1988
OLS Regression Results
==============================================================================
Dep. Variable: lscrap R-squared: 0.873
Model: OLS Adj. R-squared: 0.868
Method: Least Squares F-statistic: 174.9
Date: Mon, 11 Dec 2023 Prob (F-statistic): 1.47e-23
Time: 22:32:29 Log-Likelihood: -39.000
No. Observations: 54 AIC: 84.00
Df Residuals: 51 BIC: 89.97
Df Model: 2
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 0.0212 0.089 0.238 0.813 -0.158 0.200
grant -0.2540 0.147 -1.727 0.090 -0.549 0.041
lscrap_1 0.8312 0.044 18.701 0.000 0.742 0.920
==============================================================================
Omnibus: 13.769 Durbin-Watson: 1.541
Prob(Omnibus): 0.001 Jarque-Bera (JB): 35.075
Skew: -0.526 Prob(JB): 2.42e-08
Kurtosis: 6.805 Cond. No. 3.95
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
Example 4.5 Return to Education Depends on Ability#
# Example 4.5 (Python): interaction educ*iq lets the return to education depend
# on ability; the formula term expands to educ + iq + educ:iq.
df = pd.read_csv("nls80.csv")
interact_formula = ('lwage ~ exper + tenure + married + south + urban'
                    ' + black + educ*iq')
nls_reg = smf.ols(interact_formula, data=df).fit()
print(nls_reg.summary())
# Example 4.5 (R): educ*iq interaction (expands to educ + iq + educ:iq).
summary(lwage_rg <- lm(lwage ~ exper + tenure + married + south + urban + black + educ*iq, data=nls80) )
// Example 4.5 (Stata): c.educ#c.iq is the continuous-by-continuous interaction.
bcuse nls80, clear nodesc
reg lwage exper tenure married south urban black educ iq c.educ#c.iq
OLS Regression Results
==============================================================================
Dep. Variable: lwage R-squared: 0.263
Model: OLS Adj. R-squared: 0.256
Method: Least Squares F-statistic: 36.76
Date: Mon, 11 Dec 2023 Prob (F-statistic): 6.85e-56
Time: 22:32:29 Log-Likelihood: -374.69
No. Observations: 935 AIC: 769.4
Df Residuals: 925 BIC: 817.8
Df Model: 9
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 5.6482 0.546 10.339 0.000 4.576 6.720
exper 0.0139 0.003 4.378 0.000 0.008 0.020
tenure 0.0114 0.002 4.670 0.000 0.007 0.016
married 0.2009 0.039 5.173 0.000 0.125 0.277
south -0.0802 0.026 -3.056 0.002 -0.132 -0.029
urban 0.1836 0.027 6.835 0.000 0.131 0.236
black -0.1467 0.040 -3.695 0.000 -0.225 -0.069
educ 0.0185 0.041 0.449 0.653 -0.062 0.099
iq -0.0009 0.005 -0.182 0.855 -0.011 0.009
educ:iq 0.0003 0.000 0.888 0.375 -0.000 0.001
==============================================================================
Omnibus: 43.073 Durbin-Watson: 1.822
Prob(Omnibus): 0.000 Jarque-Bera (JB): 98.942
Skew: -0.245 Prob(JB): 3.27e-22
Kurtosis: 4.517 Cond. No. 6.62e+04
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 6.62e+04. This might indicate that there are
strong multicollinearity or other numerical problems.
# Joint F test (Python) that iq and the educ:iq interaction are jointly zero.
print(nls_reg.f_test('(educ:iq=iq=0)'))
# Joint F test (R) of the same two restrictions.
linearHypothesis(lwage_rg, c("educ:iq =0", "iq=0"))
// Joint F test (Stata) of the same two restrictions.
test iq c.educ#c.iq
<F test: F=6.83181678824927, p=0.0011341745453863027, df_denom=925, df_num=2>