# Chapter 6. Additional Single-Equation Topics#

## Examples#

import pandas as pd
import scipy.stats as ss

import statsmodels.formula.api as smf
from statsmodels.iolib.summary2 import summary_col

from wooldridge import *


### Example 6.1 Testing for endogeneity of educ in the wage equation#

# Keep only women with an observed wage (the working subsample).
df = dataWoo("mroz").dropna(subset=['wage'])

# Reduced form: regress the suspected-endogenous educ on the included
# exogenous variables plus the instruments (parents' and husband's education).
reduced_form = 'educ ~ exper + expersq + motheduc + fatheduc + huseduc'
OLS1 = smf.ols(reduced_form, data=df).fit()
print(OLS1.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                   educ   R-squared:                       0.429
Model:                            OLS   Adj. R-squared:                  0.422
Method:                 Least Squares   F-statistic:                     63.30
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           3.43e-49
Time:                        22:32:41   Log-Likelihood:                -840.80
No. Observations:                 428   AIC:                             1694.
Df Residuals:                     422   BIC:                             1718.
Df Model:                           5
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.5383      0.460     12.046      0.000       4.635       6.442
exper          0.0375      0.034      1.093      0.275      -0.030       0.105
expersq       -0.0006      0.001     -0.585      0.559      -0.003       0.001
motheduc       0.1142      0.031      3.708      0.000       0.054       0.175
fatheduc       0.1061      0.030      3.594      0.000       0.048       0.164
huseduc        0.3753      0.030     12.663      0.000       0.317       0.434
==============================================================================
Omnibus:                        7.891   Durbin-Watson:                   1.941
Prob(Omnibus):                  0.019   Jarque-Bera (JB):               11.619
Skew:                          -0.105   Prob(JB):                      0.00300
Kurtosis:                       3.780   Cond. No.                     1.96e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.96e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# Control-function step: add the reduced-form residuals (v2) to the wage
# equation; a significant coefficient on v2 signals that educ is endogenous.
v2 = OLS1.resid
aug_reg = smf.ols('lwage ~ exper + expersq + educ + v2', data=df).fit()
print(aug_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                  lwage   R-squared:                       0.162
Model:                            OLS   Adj. R-squared:                  0.154
Method:                 Least Squares   F-statistic:                     20.48
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           1.94e-15
Time:                        22:32:41   Log-Likelihood:                -430.22
No. Observations:                 428   AIC:                             870.4
Df Residuals:                     423   BIC:                             890.7
Df Model:                           4
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -0.1869      0.284     -0.659      0.510      -0.744       0.371
exper          0.0431      0.013      3.270      0.001       0.017       0.069
expersq       -0.0009      0.000     -2.192      0.029      -0.002    -8.9e-05
educ           0.0804      0.022      3.716      0.000       0.038       0.123
v2             0.0472      0.029      1.653      0.099      -0.009       0.103
==============================================================================
Omnibus:                       77.575   Durbin-Watson:                   1.953
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              295.046
Skew:                          -0.756   Prob(JB):                     8.54e-65
Kurtosis:                       6.776   Cond. No.                     3.17e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 3.17e+03. This might indicate that there are
strong multicollinearity or other numerical problems.


### Example 6.2#

# Example 6.2: wage equation with an educ*black interaction. educ and
# educ*black are the suspected endogenous regressors, instrumented by
# college proximity (nearc4) and its interaction with black.
df = dataWoo("card")

# Structural equation estimated by OLS (column "lwage I").
OLS1 = smf.ols('lwage ~ educ*black +exper + expersq + smsa + smsa66 + south + reg661+ reg662+ reg663+ reg664+ reg665+ reg666+ reg667+ reg668', data=df).fit()
# First stage for educ (column "educ").
OLS2 = smf.ols('educ ~ black*nearc4 +exper + expersq + smsa + smsa66 + south +  reg661+ reg662+ reg663+ reg664+ reg665+ reg666+ reg667+ reg668', data=df).fit()
v21 = OLS2.resid

# First stage for the interaction educ*black (column "b_educ").
df['b_educ'] = df.educ * df.black
OLS3 = smf.ols('b_educ ~ exper + expersq + black*nearc4 + smsa + smsa66 + south + reg661+ reg662+ reg663+ reg664+ reg665+ reg666+ reg667+ reg668',data=df).fit()
v22 = OLS3.resid

# Control-function regression: structural equation augmented with both
# first-stage residuals (column "lwage II"). Stray '+ +reg668' cleaned up.
OLS4 = smf.ols('lwage ~ v21 + v22 + educ*black +exper + expersq + smsa + smsa66 + south + reg661+ reg662+ reg663+ reg664+ reg665+ reg666+ reg667+ reg668', data=df).fit()

# The original summary_col call was truncated in the source: the info_dict
# and the call are closed here; the Adj.R2 row is reconstructed from the
# printed output below.
print(summary_col([OLS1, OLS2, OLS3, OLS4], stars=True, float_format='%0.3f',
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared),
                             'Adj.R2': lambda x: "{:.3f}".format(x.rsquared_adj)}))

======================================================
lwage I    educ I   b_educ I  lwage II
------------------------------------------------------
Intercept      4.807***  16.849*** 0.095     3.845***
(0.075)   (0.215)   (0.127)   (0.931)
R-squared      0.302     0.477     0.952     0.302
R-squared Adj. 0.298     0.474     0.951     0.298
black          -0.419*** -0.937*** 11.550*** -0.283
(0.079)   (0.148)   (0.088)   (0.487)
black:nearc4             0.003     0.875***
(0.177)   (0.105)
educ           0.071***                      0.127**
(0.004)                       (0.055)
educ:black     0.018***                      0.011
(0.006)                       (0.039)
exper          0.082***  -0.413*** 0.053***  0.106***
(0.007)   (0.034)   (0.020)   (0.024)
expersq        -0.002*** 0.001     -0.008*** -0.002***
(0.000)   (0.002)   (0.001)   (0.000)
nearc4                   0.319***  -0.091
(0.098)   (0.058)
reg661         -0.122*** -0.210    0.162     -0.110***
(0.039)   (0.203)   (0.120)   (0.041)
reg662         -0.023    -0.289*   0.006     -0.008
(0.028)   (0.147)   (0.087)   (0.032)
reg663         0.023     -0.238*   0.086     0.038
(0.027)   (0.143)   (0.085)   (0.031)
reg664         -0.067*   -0.093    0.113     -0.060
(0.036)   (0.186)   (0.110)   (0.037)
reg665         0.003     -0.483**  0.262**   0.034
(0.036)   (0.188)   (0.112)   (0.048)
reg666         0.015     -0.513**  0.335***  0.050
(0.040)   (0.210)   (0.124)   (0.054)
reg667         -0.007    -0.427**  0.296**   0.022
(0.039)   (0.206)   (0.122)   (0.050)
reg668         -0.176*** 0.314     0.100     -0.191***
(0.046)   (0.242)   (0.143)   (0.049)
smsa           0.134***  0.402***  0.195***  0.111***
(0.020)   (0.105)   (0.062)   (0.030)
smsa66         0.025     0.025     0.047     0.018
(0.019)   (0.106)   (0.063)   (0.021)
south          -0.144*** -0.052    -0.253*** -0.142***
(0.026)   (0.136)   (0.080)   (0.027)
v21                                          -0.057
(0.055)
v22                                          0.007
(0.039)
N              3010      3010      3010      3010
R2             0.302     0.477     0.952     0.302
Adj.R2         0.298     0.474     0.951     0.298
======================================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

# Joint test that both control-function residuals are zero; failing to
# reject means educ and educ*black can be treated as exogenous here.
exog_ftest = OLS4.f_test('(v21=v22=0)')
print(exog_ftest)

<F test: F=0.5424705213230376, p=0.5813675194444683, df_denom=2.99e+03, df_num=2>


#### IV#

from linearmodels.iv import IV2SLS

# Direct 2SLS: educ and b_educ instrumented by nearc4 and black:nearc4,
# with the same exogenous controls as the OLS specification above.
iv_formula = 'lwage ~ 1 + [educ + b_educ ~ nearc4 + black:nearc4] + black +exper + expersq + smsa + smsa66 + south + reg661+ reg662+ reg663+ reg664+ reg665+ reg666+ reg667+ +reg668'
iv_res = IV2SLS.from_formula(iv_formula, data=df).fit()
print(iv_res)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.2435
Estimator:                    IV-2SLS   Adj. R-squared:                 0.2395
No. Observations:                3010   F-statistic:                    842.36
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:42   Distribution:                 chi2(16)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept      3.8450     0.9519     4.0394     0.0001      1.9794      5.7106
black         -0.2828     0.4998    -0.5658     0.5716     -1.2623      0.6968
exper          0.1059     0.0249     4.2576     0.0000      0.0572      0.1547
expersq       -0.0022     0.0005    -4.5839     0.0000     -0.0032     -0.0013
smsa           0.1112     0.0310     3.5890     0.0003      0.0505      0.1719
smsa66         0.0180     0.0205     0.8775     0.3802     -0.0222      0.0582
south         -0.1425     0.0298    -4.7795     0.0000     -0.2009     -0.0841
reg661        -0.1103     0.0417    -2.6439     0.0082     -0.1922     -0.0285
reg662        -0.0082     0.0338    -0.2418     0.8089     -0.0745      0.0581
reg663         0.0382     0.0334     1.1447     0.2523     -0.0272      0.1037
reg664        -0.0600     0.0397    -1.5126     0.1304     -0.1378      0.0178
reg665         0.0338     0.0518     0.6526     0.5140     -0.0677      0.1352
reg666         0.0499     0.0558     0.8942     0.3712     -0.0595      0.1593
reg667         0.0217     0.0527     0.4117     0.6805     -0.0816      0.1250
reg668        -0.1908     0.0505    -3.7808     0.0002     -0.2898     -0.0919
educ           0.1274     0.0560     2.2741     0.0230      0.0176      0.2371
b_educ         0.0109     0.0398     0.2739     0.7842     -0.0671      0.0889
==============================================================================

Endogenous: educ, b_educ
Instruments: nearc4, black:nearc4
Robust Covariance (Heteroskedastic)
Debiased: False


### Example 6.3 Overidentifying restriction in the wage equation#

# Example 6.3: 2SLS wage equation with three instruments for educ
# (overidentified: 3 instruments, 1 endogenous regressor).
df = dataWoo("mroz").dropna(subset=['wage'])
# The original call was truncated in the source: `data=df).fit(...)` is
# restored. cov_type='unadjusted' reproduces the non-robust output shown
# below; the robust variant is estimated separately in the next section.
IV2 = IV2SLS.from_formula(
    'lwage ~ 1+ [educ ~ motheduc + fatheduc + huseduc] + exper + expersq',
    data=df).fit(cov_type='unadjusted')
print(IV2)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1495
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1435
No. Observations:                 428   F-statistic:                    34.900
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:43   Distribution:                  chi2(3)

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -0.1869     0.2841    -0.6578     0.5107     -0.7436      0.3699
exper          0.0431     0.0132     3.2643     0.0011      0.0172      0.0690
expersq       -0.0009     0.0004    -2.1880     0.0287     -0.0016  -8.992e-05
educ           0.0804     0.0217     3.7095     0.0002      0.0379      0.1229
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Debiased: False

# Sargan-test auxiliary step: regress the 2SLS residuals on all exogenous
# variables and instruments; with valid instruments this R-squared is tiny.
uhat = IV2.resids
sargan_formula = 'uhat ~ exper + expersq + motheduc + fatheduc + huseduc'
uhat_reg = smf.ols(sargan_formula, data=df).fit()
print(uhat_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                   uhat   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                    0.2205
Date:                Mon, 11 Dec 2023   Prob (F-statistic):              0.954
Time:                        22:32:43   Log-Likelihood:                -432.88
No. Observations:                 428   AIC:                             877.8
Df Residuals:                     422   BIC:                             902.1
Df Model:                           5
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0086      0.177      0.049      0.961      -0.340       0.357
exper       5.603e-05      0.013      0.004      0.997      -0.026       0.026
expersq    -8.882e-06      0.000     -0.022      0.982      -0.001       0.001
motheduc      -0.0104      0.012     -0.875      0.382      -0.034       0.013
fatheduc       0.0007      0.011      0.059      0.953      -0.022       0.023
huseduc        0.0068      0.011      0.593      0.553      -0.016       0.029
==============================================================================
Omnibus:                       71.851   Durbin-Watson:                   1.938
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              262.548
Skew:                          -0.707   Prob(JB):                     9.74e-58
Kurtosis:                       6.567   Cond. No.                     1.96e+03
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.96e+03. This might indicate that there are
strong multicollinearity or other numerical problems.

# Sargan overidentification statistic: LM = n * R^2, chi-squared with
# 2 df (3 instruments minus 1 endogenous regressor).
LM1 = uhat_reg.rsquared * uhat_reg.nobs
P1 = ss.chi2.sf(LM1, 2)
print(LM1, P1)

1.1150430012569017 0.5726265610619277


#### Heteroskedasticity Robust#

# Same 2SLS specification, now with heteroskedasticity-robust standard
# errors (linearmodels' default covariance estimator).
robust_iv = IV2SLS.from_formula(
    'lwage ~ 1+ [educ ~ motheduc + fatheduc + huseduc] + exper + expersq',
    data=df).fit()
print(robust_iv)

                          IV-2SLS Estimation Summary
==============================================================================
Dep. Variable:                  lwage   R-squared:                      0.1495
Estimator:                    IV-2SLS   Adj. R-squared:                 0.1435
No. Observations:                 428   F-statistic:                    27.835
Date:                Mon, Dec 11 2023   P-value (F-stat)                0.0000
Time:                        22:32:43   Distribution:                  chi2(3)
Cov. Estimator:                robust

Parameter Estimates
==============================================================================
Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
Intercept     -0.1869     0.2999    -0.6232     0.5332     -0.7746      0.4008
exper          0.0431     0.0152     2.8289     0.0047      0.0132      0.0730
expersq       -0.0009     0.0004    -2.0558     0.0398     -0.0017  -4.023e-05
educ           0.0804     0.0216     3.7216     0.0002      0.0381      0.1227
==============================================================================

Endogenous: educ
Instruments: motheduc, fatheduc, huseduc
Robust Covariance (Heteroskedastic)
Debiased: False


#### Heteroskedasticity-robust test using the LM statistic, p. 137#

# Heteroskedasticity-robust LM test for the overidentifying restrictions
# (Wooldridge, p. 137): build fitted educ, then purge each extra instrument
# of the included exogenous variables and the fitted education values.
df = dataWoo('mroz')
edu_reg = smf.ols('educ ~ exper + expersq + motheduc + fatheduc + huseduc', data= df).fit()
df['euhat'] = edu_reg.predict()  # fitted values of educ
moth_reg = smf.ols('motheduc ~ exper + expersq + euhat', data=df).fit()
df['rm'] = moth_reg.resid  # motheduc net of exogenous vars and fitted educ
fath_reg = smf.ols('fatheduc ~ exper + expersq + euhat', data=df).fit()
df['rf'] = fath_reg.resid  # fatheduc net of exogenous vars and fitted educ

# The original summary_col call was truncated in the source: the info_dict
# and the call are closed here; the Adj.R2 row is reconstructed from the
# printed output below.
print(summary_col([edu_reg, moth_reg, fath_reg], stars=True, float_format='%0.3f',
                  info_dict={'N': lambda x: "{0:d}".format(int(x.nobs)),
                             'R2': lambda x: "{:.3f}".format(x.rsquared),
                             'Adj.R2': lambda x: "{:.3f}".format(x.rsquared_adj)}))

===========================================
educ    motheduc  fatheduc
-------------------------------------------
Intercept      5.116*** -7.413*** -9.170***
(0.298)  (0.742)   (0.778)
R-squared      0.466    0.430     0.442
R-squared Adj. 0.462    0.427     0.440
euhat                   1.425***  1.534***
(0.061)   (0.064)
exper          0.053**  -0.105*** -0.107***
(0.022)  (0.034)   (0.035)
expersq        -0.001   0.002     0.001
(0.001)  (0.001)   (0.001)
fatheduc       0.101***
(0.021)
huseduc        0.372***
(0.022)
motheduc       0.130***
(0.022)
N              753      753       753
R2             0.466    0.430     0.442
Adj.R2         0.462    0.427     0.440
===========================================
Standard errors in parentheses.
* p<.1, ** p<.05, ***p<.01

df['one'] = 1
# Auxiliary regression for the robust LM statistic: a constant regressed on
# the instrument-residual interactions with no intercept. NOTE(review):
# 'uhat' is not a df column here — patsy resolves it from the environment
# (the 2SLS residuals computed in Example 6.3); indices align on the
# wage subsample. Confirm this is intentional.
working = df.dropna(subset=['wage'])
LMreg = smf.ols('one ~ uhat:rm + uhat:rf + 0', data=working).fit()
print(LMreg.summary())

                                 OLS Regression Results
=======================================================================================
Dep. Variable:                    one   R-squared (uncentered):                   0.002
Model:                            OLS   Adj. R-squared (uncentered):             -0.002
Method:                 Least Squares   F-statistic:                             0.5082
Date:                Mon, 11 Dec 2023   Prob (F-statistic):                       0.602
Time:                        22:32:43   Log-Likelihood:                         -606.80
No. Observations:                 428   AIC:                                      1218.
Df Residuals:                     426   BIC:                                      1226.
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
uhat:rm       -0.0270      0.029     -0.933      0.352      -0.084       0.030
uhat:rf       -0.0005      0.031     -0.016      0.987      -0.061       0.060
==============================================================================
Omnibus:                      201.665   Durbin-Watson:                   0.004
Prob(Omnibus):                  0.000   Jarque-Bera (JB):             4236.833
Skew:                          -1.501   Prob(JB):                         0.00
Kurtosis:                      18.119   Cond. No.                         1.47
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# Robust LM statistic: n * R^2 (uncentered) from the auxiliary regression,
# chi-squared with 2 df (the two overidentifying restrictions).
LM2 = LMreg.rsquared * LMreg.nobs
P2 = ss.chi2.sf(LM2, 2)
print(LM2, P2)

1.0187447267734435 0.6008725901243155


### Example 6.4 Testing for neglected nonlinearities in a wage equation#

# Example 6.4: baseline wage regression on the NLS80 data, to be tested
# below for neglected nonlinearities (RESET).
df = pd.read_csv("nls80.csv")
wage_formula = 'lwage ~ exper + tenure + married + south + urban + black + educ'
nls_reg = smf.ols(wage_formula, data=df).fit()
print(nls_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                  lwage   R-squared:                       0.253
Model:                            OLS   Adj. R-squared:                  0.247
Method:                 Least Squares   F-statistic:                     44.75
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           1.16e-54
Time:                        22:32:43   Log-Likelihood:                -381.55
No. Observations:                 935   AIC:                             779.1
Df Residuals:                     927   BIC:                             817.8
Df Model:                           7
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      5.3955      0.113     47.653      0.000       5.173       5.618
exper          0.0140      0.003      4.409      0.000       0.008       0.020
tenure         0.0117      0.002      4.789      0.000       0.007       0.017
married        0.1994      0.039      5.107      0.000       0.123       0.276
south         -0.0909      0.026     -3.463      0.001      -0.142      -0.039
urban          0.1839      0.027      6.822      0.000       0.131       0.237
black         -0.1883      0.038     -5.000      0.000      -0.262      -0.114
educ           0.0654      0.006     10.468      0.000       0.053       0.078
==============================================================================
Omnibus:                       38.227   Durbin-Watson:                   1.823
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               83.390
Skew:                          -0.224   Prob(JB):                     7.80e-19
Kurtosis:                       4.393   Cond. No.                         187.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# RESET-style test: regress the OLS residuals on the original regressors
# plus the squared and cubed fitted values of lwage.
fitted = nls_reg.predict()
df['uhat'] = nls_reg.resid
df['wghat2'] = fitted ** 2
df['wghat3'] = fitted ** 3
uhat_reg = smf.ols('uhat ~ exper + tenure + married + south + urban + black + educ + wghat2 + wghat3', data = df).fit()
print(uhat_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                   uhat   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.009
Method:                 Least Squares   F-statistic:                   0.03616
Date:                Mon, 11 Dec 2023   Prob (F-statistic):               1.00
Time:                        22:32:43   Log-Likelihood:                -381.38
No. Observations:                 935   AIC:                             782.8
Df Residuals:                     925   BIC:                             831.2
Df Model:                           9
Covariance Type:            nonrobust
==============================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept   -171.4820    313.246     -0.547      0.584    -786.236     443.272
exper         -0.7623      1.397     -0.546      0.585      -3.504       1.980
tenure        -0.6377      1.169     -0.546      0.585      -2.932       1.656
married      -10.8255     19.840     -0.546      0.585     -49.761      28.110
south          4.9351      9.045      0.546      0.585     -12.816      22.686
urban         -9.9850     18.300     -0.546      0.585     -45.899      25.929
black         10.2263     18.739      0.546      0.585     -26.550      47.003
educ          -3.5516      6.510     -0.546      0.585     -16.328       9.224
wghat2         8.0827     14.746      0.548      0.584     -20.857      37.023
wghat3        -0.4008      0.728     -0.550      0.582      -1.830       1.028
==============================================================================
Omnibus:                       38.085   Durbin-Watson:                   1.822
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               83.483
Skew:                          -0.220   Prob(JB):                     7.44e-19
Kurtosis:                       4.396   Cond. No.                     8.38e+06
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 8.38e+06. This might indicate that there are
strong multicollinearity or other numerical problems.

# RESET LM statistic: n * R^2, chi-squared with 2 df (the two added
# fitted-value powers, wghat2 and wghat3).
LM3 = uhat_reg.rsquared * uhat_reg.nobs
P3 = ss.chi2.sf(LM3, 2)
print(LM3, P3)

0.32886895188973253 0.8483733439849407


### Example 6.5 Length of Time on Workers' Compensation#

# Example 6.5: difference-in-differences — effect of the benefit-cap change
# (afchnge) on log injury duration for high earners, Kentucky sample only.
df = dataWoo('injury')
df = df[df['ky'] == 1]
did_reg = smf.ols('ldurat ~ afchnge*highearn', data=df).fit()
print(did_reg.summary())

                            OLS Regression Results
==============================================================================
Dep. Variable:                 ldurat   R-squared:                       0.021
Model:                            OLS   Adj. R-squared:                  0.020
Method:                 Least Squares   F-statistic:                     39.54
Date:                Mon, 11 Dec 2023   Prob (F-statistic):           2.81e-25
Time:                        22:32:43   Log-Likelihood:                -9322.0
No. Observations:                5626   AIC:                         1.865e+04
Df Residuals:                    5622   BIC:                         1.868e+04
Df Model:                           3
Covariance Type:            nonrobust
====================================================================================
coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            1.1256      0.031     36.621      0.000       1.065       1.186
afchnge              0.0077      0.045      0.171      0.864      -0.080       0.095
highearn             0.2565      0.047      5.406      0.000       0.163       0.349
afchnge:highearn     0.1906      0.069      2.782      0.005       0.056       0.325
==============================================================================
Omnibus:                       29.931   Durbin-Watson:                   1.905
Prob(Omnibus):                  0.000   Jarque-Bera (JB):               41.672
Skew:                           0.037   Prob(JB):                     8.93e-10
Kurtosis:                       3.415   Cond. No.                         6.38
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.