# Chapter 5. IV Estimation of Single-Equation Linear Models

## Examples

import pandas as pd
import scipy.stats as ss

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *
from linearmodels.iv import IV2SLS


### Example 5.3 Parents’ and Husband’s Education as IVs

# Example 5.3 data: the `mroz` data set from dataWoo, keeping only the rows
# where `lwage` is observed.
df = dataWoo("mroz")
df = df.dropna(subset=['lwage'])

# Benchmark OLS regression of lwage on exper, expersq and educ (no instruments).
ols_wage = smf.ols(formula='lwage ~ exper + expersq + educ', data=df).fit()
print(ols_wage.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                  lwage   R-squared:                       0.157
Method:                 Least Squares   F-statistic:                     26.29
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           1.30e-15
Time:                        22:32:33   Log-Likelihood:                -431.60
No. Observations:                 428   AIC:                             871.2
Df Residuals:                     424   BIC:                             887.4
Df Model:                           3
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.5220      0.199     -2.628      0.009      -0.912      -0.132
exper          0.0416      0.013      3.155      0.002       0.016       0.067
expersq       -0.0008      0.000     -2.063      0.040      -0.002   -3.82e-05
educ           0.1075      0.014      7.598      0.000       0.080       0.135
==============================================================================
Omnibus:                       77.792   Durbin-Watson:                   1.961
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              300.917
Skew:                          -0.753   Prob(JB):                     4.54e-66
Kurtosis:                       6.822   Cond. No.                     2.21e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 2.21e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# 2SLS version of the wage equation: educ is the endogenous regressor and
# motheduc, fatheduc and huseduc are its excluded instruments.
iv1_formula = 'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]'
iv1_sample = df.dropna(subset=['lwage'])
IV1 = IV2SLS.from_formula(iv1_formula, data=iv1_sample).fit()
print(IV1)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1495
No. Observations:                 428   F-statistic:                    27.835
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:33   Distribution:                  chi2(3)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -0.1869     0.2999    -0.6232     0.5332     -0.7746      0.4008
exper          0.0431     0.0152     2.8289     0.0047      0.0132      0.0730
expersq       -0.0009     0.0004    -2.0558     0.0398     -0.0017  -4.023e-05
educ           0.0804     0.0216     3.7216     0.0002      0.0381      0.1227
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False

# First-stage regression of educ on the three instruments and the exogenous
# regressors exper and expersq.
# The sample is restricted on `lwage` only -- the same rule the 2SLS fit uses --
# rather than a bare `dropna()`, which would discard a row for a missing value
# in ANY column of the data set and could silently shrink the first-stage
# sample relative to the 2SLS sample.
stage1 = smf.ols(
    'educ ~motheduc + fatheduc + huseduc + exper + expersq',
    data=df.dropna(subset=['lwage']),
).fit()
print(stage1.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                   educ   R-squared:                       0.429
Method:                 Least Squares   F-statistic:                     63.30
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           3.43e-49
Time:                        22:32:33   Log-Likelihood:                -840.80
No. Observations:                 428   AIC:                             1694.
Df Residuals:                     422   BIC:                             1718.
Df Model:                           5
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.5383      0.460     12.046      0.000       4.635       6.442
motheduc       0.1142      0.031      3.708      0.000       0.054       0.175
fatheduc       0.1061      0.030      3.594      0.000       0.048       0.164
huseduc        0.3753      0.030     12.663      0.000       0.317       0.434
exper          0.0375      0.034      1.093      0.275      -0.030       0.105
expersq       -0.0006      0.001     -0.585      0.559      -0.003       0.001
==============================================================================
Omnibus:                        7.891   Durbin-Watson:                   1.941
Prob(Omnibus):                  0.019   Jarque-Bera (JB):               11.619
Skew:                          -0.105   Prob(JB):                      0.00300
Kurtosis:                       3.780   Cond. No.                     1.96e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.96e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# Joint F test that the coefficients on motheduc, fatheduc and huseduc are all
# zero in the first-stage regression.
instrument_f_test = stage1.f_test('(motheduc=fatheduc=huseduc=0)')
print(instrument_f_test)

<F test: F=104.29424463273558, p=1.5857824440176843e-50, df_denom=422, df_num=3>


### Example 5.4 Parents’ and Husband’s Education as IVs, cont’d

# 2SLS wage equation extended with kidslt6 and kidsge6 as additional exogenous
# regressors; educ is still instrumented by motheduc, fatheduc and huseduc.
# The call needs its `data=` argument, closing parenthesis and `.fit()` to be a
# complete statement.  The fit uses the unadjusted (homoskedastic) covariance
# estimator: the standard errors recorded for this cell differ from those of
# the robust refit of the same formula later in this example.
IV2 = IV2SLS.from_formula(
    'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]+ kidslt6 + kidsge6',
    data=df,
).fit(cov_type='unadjusted')
print(IV2)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1505
No. Observations:                 428   F-statistic:                    35.923
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:33   Distribution:                  chi2(5)
Cov. Estimator:            unadjusted

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -0.1315     0.3017    -0.4359     0.6629     -0.7229      0.4598
exper          0.0415     0.0134     3.1080     0.0019      0.0153      0.0677
expersq       -0.0009     0.0004    -2.1743     0.0297     -0.0016  -8.456e-05
kidslt6       -0.0313     0.0855    -0.3665     0.7140     -0.1989      0.1362
kidsge6       -0.0182     0.0270    -0.6761     0.4990     -0.0710      0.0346
educ           0.0799     0.0222     3.5976     0.0003      0.0364      0.1234
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Unadjusted Covariance (Homoskedastic)
Debiased: False

# Residuals from IV1, the 2SLS fit that leaves out kidslt6 and kidsge6.
# The variable has to keep the name `uhat` because the formula below refers to
# it by name (presumably resolved from the calling namespace -- df has not been
# given a `uhat` column at this point; confirm).
uhat = IV1.resids

# Auxiliary 2SLS regression of those residuals on the specification that
# includes kidslt6 and kidsge6.  Rows are selected on `wage` here, not `lwage`.
iv3_formula = 'uhat ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]+ kidslt6 + kidsge6'
iv3_sample = df.dropna(subset=['wage'])
IV3 = IV2SLS.from_formula(iv3_formula, data=iv3_sample).fit()
print(IV3)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                   uhat   R-squared:                      0.0011
No. Observations:                 428   F-statistic:                    0.5040
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.9920
Time:                        22:32:33   Distribution:                  chi2(5)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      0.0553     0.3184     0.1738     0.8620     -0.5687      0.6793
exper         -0.0016     0.0153    -0.1045     0.9167     -0.0317      0.0285
expersq     5.169e-06     0.0004     0.0124     0.9901     -0.0008      0.0008
kidslt6       -0.0313     0.1007    -0.3112     0.7556     -0.2287      0.1660
kidsge6       -0.0182     0.0282    -0.6471     0.5175     -0.0734      0.0370
educ          -0.0005     0.0223    -0.0235     0.9813     -0.0442      0.0432
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False

# Residuals from the OLS regression of educ on the instruments and all
# exogenous regressors.  The name `v2` is referenced by the next formula.
educ_reduced_form = smf.ols(
    'educ ~ motheduc + fatheduc + huseduc + exper + expersq + kidslt6 + kidsge6',
    data=df,
).fit()
v2 = educ_reduced_form.resid

# OLS wage equation with v2 added as an extra regressor, followed by the joint
# F test that the kidslt6 and kidsge6 coefficients are both zero.
IV2f = smf.ols(
    'lwage ~ 1 + educ + exper + expersq  + kidslt6 + kidsge6 + v2 ',
    data=df,
).fit()
kids_f_test = IV2f.f_test('(kidslt6=kidsge6=0)')
print(kids_f_test)

<F test: F=0.30975430489632627, p=0.7337942346879394, df_denom=421, df_num=2>

# LM statistic: number of observations times the R-squared of the auxiliary
# regression IV3, with its upper-tail chi-square probability on 2 degrees of
# freedom.
LM1 = IV3.rsquared * IV3.nobs
P = ss.chi2.sf(LM1, df=2)
print(LM1, P)

0.47737716303172517 0.7876601361080349

# Refit of the extended 2SLS wage equation with the heteroskedasticity-robust
# covariance estimator.  `cov_type='robust'` is spelled out here; it is also
# what `fit()` uses when no covariance type is given.
iv2_formula = 'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]+ kidslt6 + kidsge6'
IV2 = IV2SLS.from_formula(iv2_formula, data=df).fit(cov_type='robust')
print(IV2)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1505
No. Observations:                 428   F-statistic:                    30.243
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:33   Distribution:                  chi2(5)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -0.1315     0.3184    -0.4131     0.6795     -0.7555      0.4925
exper          0.0415     0.0153     2.7054     0.0068      0.0114      0.0716
expersq       -0.0009     0.0004    -2.0505     0.0403     -0.0017  -3.789e-05
kidslt6       -0.0313     0.1007    -0.3112     0.7556     -0.2287      0.1660
kidsge6       -0.0182     0.0282    -0.6471     0.5175     -0.0734      0.0370
educ           0.0799     0.0223     3.5810     0.0003      0.0362      0.1236
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False


#### LM test (page 107)

# Step 1: store the 2SLS residuals of lwage, kidslt6 and kidsge6 -- each fitted
# on the same restricted specification (exper, expersq, instrumented educ) --
# as new columns of df.  The mapping is {new column: formula}, processed in
# the order written.
residual_formulas = {
    'uhat': 'lwage ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    'u_klt6': 'kidslt6 ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
    'u_kage6': 'kidsge6  ~ 1 + exper + expersq + [educ ~ motheduc + fatheduc + huseduc]',
}
for resid_column, resid_formula in residual_formulas.items():
    resid_fit = IV2SLS.from_formula(
        resid_formula,
        data=df.dropna(subset=['lwage']),
    ).fit()
    df[resid_column] = resid_fit.resids

# Step 2: regress a column of ones on the products u_klt6*uhat and
# u_kage6*uhat, with no intercept (the trailing `+ 0` in the formula).
df['one'] = 1
lm_sample = df.dropna(subset=['lwage'])
long = smf.ols('one ~ u_klt6:uhat + u_kage6:uhat + 0', data=lm_sample).fit()
print(long.summary())

                                 OLS Regression Results
=======================================================================================
Dep. Variable:                    one   R-squared (uncentered):                   0.001
Model:                            OLS   Adj. R-squared (uncentered):             -0.004
Method:                 Least Squares   F-statistic:                             0.2330
Date:                Mon, 11 Dec 2023   Prob (F-statistic):                       0.792
Time:                        22:32:33   Log-Likelihood:                         -607.07
No. Observations:                 428   AIC:                                      1218.
Df Residuals:                     426   BIC:                                      1226.
Df Model:                           2
Covariance Type:            nonrobust
================================================================================
coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
u_klt6:uhat     -0.0848      0.163     -0.520      0.603      -0.405       0.236
u_kage6:uhat    -0.0218      0.058     -0.379      0.705      -0.135       0.091
==============================================================================
Omnibus:                      130.945   Durbin-Watson:                   0.002
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             6061.081
Skew:                           0.439   Prob(JB):                         0.00
Kurtosis:                      21.415   Cond. No.                         2.85
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# LM statistic from the regression of ones on the residual products: number of
# observations times its (uncentered) R-squared, with the upper-tail chi-square
# probability on 2 degrees of freedom.
LM1 = long.rsquared * long.nobs
P = ss.chi2.sf(LM1, df=2)
print(LM1, P)

0.46762334065794464 0.7915108668409525


### Example 5.5 IQ and KWW as Indicators of Ability

# Example 5.5 data: read from the local file nls80.csv (this rebinds `df`).
df = pd.read_csv("nls80.csv")

# 2SLS wage equation in which iq is the endogenous regressor and kww is its
# single excluded instrument.
iq_formula = 'lwage ~ 1 + exper + tenure + married + south + urban + black + educ + [iq~kww]'
iq_fit = IV2SLS.from_formula(iq_formula, data=df).fit()
print(iq_fit)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1900
No. Observations:                 935   F-statistic:                    356.33
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:34   Distribution:                  chi2(8)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      4.5925     0.3501     13.117     0.0000      3.9063      5.2786
exper          0.0144     0.0034     4.2234     0.0000      0.0077      0.0211
tenure         0.0105     0.0028     3.7258     0.0002      0.0050      0.0160
married        0.2007     0.0404     4.9616     0.0000      0.1214      0.2800
south         -0.0516     0.0339    -1.5201     0.1285     -0.1180      0.0149
urban          0.1767     0.0274     6.4470     0.0000      0.1230      0.2304
black         -0.0226     0.0798    -0.2826     0.7775     -0.1790      0.1339
educ           0.0250     0.0187     1.3410     0.1799     -0.0116      0.0616
iq             0.0130     0.0055     2.3835     0.0171      0.0023      0.0238
==============================================================================

Endogenous: iq
Instruments: kww
Robust Covariance (Heteroskedastic)
Debiased: False