UCLA Academic Technology Services HomeServicesClassesContactJobs
Search

SAS Paper Examples
Missing data: our view of the state of the art by Schafer and Graham

This article appeared in Psychological Methods in 2002, volume 7 and it can be accessed directly  from a UCLA IP address.


Fundamentals

Table 1 on page 153.

options nocenter nodate formchar = '|----|+|---+=|-/<>*';
data schafer;
  input x y y0;
datalines;
169 148 148
126 123 .
132 149 .
160 169 .
105 138 .
116 102 .
125 88 .
112 100 .
133 150 .
94 113 .
109 78 .
109 96 .
106 148 .
176 137 .
128 155 .
131 131 .
130 101 101
145 155 .
136 140 .
146 134 .
111 129 .
97 85  85
134 124 124
153 112 .
118 118 .
137 122 122
101 119 .
103 106 106
78 74 74
151 113 .
;
run;
data table1;
  set schafer;
  y2 = y;
  if x <=140 then y2 = .;
  y3 = y;
  if y <=140 then y3 =.;
run;
proc means data = table1 mean std;
run;
The MEANS Procedure
Variable            Mean         Std Dev
----------------------------------------
x            125.7000000      23.0144632
y            121.9000000      24.6665346
y0           108.5714286      25.1253998
y2           138.2857143      21.0848625
y3           153.4285714       7.5023806
----------------------------------------

Other Methods

Data described on page 156. The data set  is a simulation of one thousand samples. Each sample has sample size of 50. Most of the analyses in the article are based on this data set. We will create it once here and freely use it in the rest of examples. Because of the nature of simulation, the results presented here are slightly different from the results in the article. In the data, y1 corresponds to MCAR, y2 corresponds to MAR and y3 corresponds to MNAR.

%let nsample = 1000;
%let ssize = 50;
%let mu = 125;
%let sigma = 25;
%let corr = .6;
data simdata;
   do sample = 1 to &nsample;
      do i = 1 to &ssize;
          x = rannor(i)*&sigma + &mu;
          y =&corr*x + &sigma*sqrt(1-(&corr)**2)*rannor(sample)+ (1-&corr)*&mu;
	  output;
	  end;
   end;
run;
%let p = .73; /*missing rate*/
data simdata_a;
  set simdata;
  y1 = y;
  y2 = y;
  y3 = y;
  if uniform(2345) < &p then y1 =.;
  if x <= 140 then y2 = .;
  if y <= 140 then y3 = .;
run;

Part 1 of Table 2 on page 156.

ods listing close;
ods output  SimpleStats = part1 PearsonCorr = part2;
proc corr data = simdata_a ;
  by sample;
  var x;
  with y y1 y2 y3;
run;
ods listing;
*for estimates of the mean;
proc sql;
  select mean(mean) as mean_y1, sqrt(sum((mean-125)**2)/&nsample) as rmse
  from part1
  where variable="y1";
  select mean(mean) as mean_y2, sqrt(sum((mean-125)**2)/&nsample) as rmse
  from part1
  where variable="y2";
  select mean(mean) as mean_y3, sqrt(sum((mean-125)**2)/&nsample) as rmse
  from part1
  where variable="y3";
quit;
 mean_y1      rmse
------------------
125.1212  6.851562
 mean_y2      rmse
------------------
143.1475  19.11867
 mean_y3      rmse
------------------
155.3093  30.50782
*for estimates of the standard deviation;
proc sql;
  select mean(stddev) as stddev_y1, sqrt(sum((stddev-25)**2)/&nsample) as rmse
  from part1
  where variable="y1";
  select mean(stddev) as stddev_y2, sqrt(sum((stddev-25)**2)/&nsample) as rmse
  from part1
  where variable="y2";
  select mean(stddev) as stddev_y3, sqrt(sum((stddev-25)**2)/&nsample) as rmse
  from part1
  where variable="y3";
quit;
stddev_y1      rmse
-------------------
 24.37371   5.23205
stddev_y2      rmse
-------------------
 20.76608   6.09129

stddev_y3      rmse
-------------------
 12.11897  13.29513
*for correlation coefficient;
proc sql;
  select mean(x) as corr_y1, sqrt(sum((x -.6)**2)/&nsample) as rmse
  from part2
  where variable="y1";
  select mean(x) as corr_y2, sqrt(sum((x - .6)**2)/&nsample) as rmse
  from part2
  where variable="y2";
  select mean(x) as mean_y3, sqrt(sum((x - .6)**2)/&nsample) as rmse
  from part2
  where variable="y3";
quit;
 corr_y1      rmse
------------------
0.578113  0.209918

 corr_y2      rmse
------------------
 0.33104  0.375956
 mean_y3      rmse
------------------
0.322239  0.384585
*regression coefficient: y on x;
proc reg data = simdata_a outest=y1_on_x noprint;
  by sample;
  model y1 = x;
run;
proc sql;
  select mean(x) as slope_y1_on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from y1_on_x;
quit;

proc reg data = simdata_a outest=y2_on_x noprint;
  by sample;
  model y2 = x;
run;
proc sql;
  select mean(x) as slope_y2_on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from y2_on_x;
quit;

proc reg data = simdata_a outest=y3_on_x noprint;
  by sample;
  model y3 = x;
run;
proc sql;
  select mean(x) as slope_y3_on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from y3_on_x;
quit;
   slope_y1_
        on_x      rmse
----------------------
    0.584609   0.27461

   slope_y2_
        on_x      rmse
----------------------
    0.582127  0.543593

   slope_y3_
        on_x      rmse
----------------------
    0.197364  0.443121
*regression coefficient: x on y;
proc reg data = simdata_a outest=x_on_y1 noprint;
  by sample;
  model x = y1;
run;
quit;
proc sql;
  select mean(y1) as slope_x_on_y1, sqrt(sum((y1-.6)**2)/&nsample) as rmse
  from x_on_y1;
quit;

proc reg data = simdata_a outest=x_on_y2 noprint;
  by sample;
  model x = y2;
run;
quit;
proc sql;
  select mean(y2) as slope_x_on_y2, sqrt(sum((y2-.6)**2)/&nsample) as rmse
  from x_on_y2;
quit;

proc reg data = simdata_a outest=x_on_y3 noprint;
  by sample;
  model x = y3;
run;
quit;
proc sql;
  select mean(y3) as slope_x_on_y3, sqrt(sum((y3-.6)**2)/&nsample) as rmse
  from x_on_y3;
quit;
    slope_x_
       on_y1      rmse
----------------------
      0.6035    0.2583
    slope_x_
       on_y2      rmse
----------------------
    0.205958  0.434805
    slope_x_
       on_y3      rmse
----------------------
    0.580092  0.535735

Part 2 of Table 1 on page 156.

Coverage for mean:

ods listing close;
ods output BasicIntervals = cover;
proc univariate data = simdata_a cibasic;
  class sample ;
  var y1 y2 y3;
run;
ods listing;
data cover1;
  set cover ;
  if parameter = 'Mean' then cmean = (lowercl <= 125 <=uppercl)*100;
  width = uppercl - lowercl;
  if parameter = 'Std Deviation' then cmean = (lowercl <=25<=uppercl)*100;
run;
proc means data = cover1 mean ;
  title "Coverage of mean from y1";
  where varname = 'y1' and parameter='Mean';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage of mean from y2";
  where varname = 'y2' and parameter='Mean';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage of mean from y3";
  where varname = 'y3' and parameter='Mean';
  var cmean width;
run;
Coverage of mean from y1
Variable            Mean
------------------------
cmean         96.2000000
width         29.7482388
------------------------
Coverage of mean from y2

Variable            Mean
------------------------
cmean         20.6000000
width         25.0884499
------------------------
Coverage of mean from y3
Variable            Mean
------------------------
cmean                  0
width         14.7158341
------------------------

Coverage for standard deviation:

proc means data = cover1 mean;
  title "Coverage for std from y1";
  where varname = 'y1' and parameter='Std Deviation';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage for std from y2";
  where varname = 'y2' and parameter='Std Deviation';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage for std from y3";
  where varname = 'y3' and parameter='Std Deviation';
  var cmean width;
run;
Coverage for std from y1
Variable            Mean
------------------------
cmean         95.0000000
width         23.1640982
------------------------
Coverage for std from y2
Variable            Mean
------------------------
cmean         91.4000000
width         19.4889744
------------------------
Coverage for std from y3
Variable            Mean
------------------------
cmean         18.6000000
width         11.4431014
------------------------

Coverage for the correlation coefficient:

data rho_part;
  set part2;
  *for real x<1 the inverse hyperbolic tangent is 
   tanh^(-1)(x) = 1/2*log((1+x)/(1-x));
  rl = tanh(1/2*log((1+x)/(1-x)) - 1.96/sqrt(nx - 3));
  ru = tanh(1/2*log((1+x)/(1-x)) + 1.96/sqrt(nx - 3));
  c = (rl <=.6 <=ru)*100;
  width = ru - rl;
run;
proc means data = rho_part mean;
  title "Coverage for rho from y1";
  where variable = "y1";
  var c width;
run;
proc means data = rho_part mean;
  title "Coverage for rho from y2";
  where variable = "y2";
  var c width;
run;
proc means data = rho_part mean;
  title "Coverage for rho from y3";
  where variable = "y3";
  var c width;
run;
Coverage for rho from y1
Variable            Mean
------------------------
c             95.5000000
width          0.7654581
------------------------
Coverage for rho from y2
Variable            Mean
------------------------
c             81.4000000
width          0.9437288
------------------------
Coverage for rho from y3
Variable            Mean
------------------------
c             80.8000000
width          0.9510662
------------------------

Coverage for the regression coefficient, y on x:

ods listing close;
proc reg data = simdata_a  ;
  by sample;
  model y1 = x /clb;
  ods output  ParameterEstimates = y1_on_x;
run;
quit;

ods listing;
data y1_on_x_b;
  set y1_on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = y1_on_x_b mean;
  title "coverage for beta from y1";
  var c width;
run;
coverage for beta from y1
Variable            Mean
------------------------
c             94.6000000
width          1.0994026
------------------------
ods listing close;
proc reg data = simdata_a ;
  by sample;
  model y2 = x /clb;
  ods output  ParameterEstimates = y2_on_x;
run;
quit;
ods listing;
data y2_on_x_b;
  set y2_on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = y2_on_x_b mean;
  title "coverage for beta from y2";
  var c width;
run;
coverage for beta from y2
Variable            Mean
------------------------
c             94.6000000
width          2.1740529
------------------------

ods listing close;
proc reg data = simdata_a ;
  by sample;
  model y3 = x /clb;
  ods output  ParameterEstimates = y3_on_x;
run;
quit;
ods listing;
data y3_on_x_b;
  set y3_on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = y3_on_x_b mean;
  title "coverage for beta from y3";
  var c width;
run;
coverage for beta from y3
Variable            Mean
------------------------
c             37.4000000
width          0.7306251
------------------------

Coverage for regression coefficient, x on y:

options nonotes;
ods listing close;
proc reg data = simdata_a  ;
  by sample;
  model x = y1 /clb;
  ods output  ParameterEstimates = x_on_y1;
run;
quit;
ods listing;
data x_on_y1_b;
  set x_on_y1  (where = (variable="y1"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_y1_b mean;
  title "coverage for beta from y1";
  var c width;
run;
coverage for beta from y1
Variable            Mean
------------------------
c             96.0000000
width          1.1240362
------------------------
ods listing close;
proc reg data = simdata_a  ;
  by sample;
  model x = y2 /clb;
  ods output  ParameterEstimates = x_on_y2;
run;
quit;
ods listing;
data x_on_y2_b;
  set x_on_y2  (where = (variable="y2"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_y2_b mean;
  title "coverage for beta from y2";
  var c width;
run;
coverage for beta from y2
Variable            Mean
------------------------
c             40.6000000
width          0.7435461
------------------------
ods listing close;
proc reg data = simdata_a  ;
  by sample;
  model x = y3 /clb;
  ods output  ParameterEstimates = x_on_y3;
run;
quit;
ods listing;
data x_on_y3_b;
  set x_on_y3  (where = (variable="y3"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_y3_b mean;
  title "coverage for beta from y3";
  var c width;
run;
coverage for beta from y3
Variable            Mean
------------------------
c             95.1000000
width          2.2684332
------------------------

Averaging the available items on page 158. The first part of the code creates a macro program, called %msim. We use this macro program to generate simulated data. After the data set is generated, it is modified based on the criterion specified in the article, such as missing data rate for each of the variables. Then we call proc reg to perform regression analysis on each of the samples. Last, we calculate the mean estimates of the beta coefficient and the coverage. Notice that in this simulation run, the coverage is only about 73%, much lower than 90.3% in the article.

%macro msim(seed, ssize, nrep, outdata);
proc iml;
* correlation matrix;
 cor={1  .5  .2  .2  .5,  
     .5   1  .2  .2  .5, 
     .2  .2   1  .5  .1,
     .2  .2  .5   1  .1,
     .5  .5  .1  .1   1} ;
* assume each variable is standardized;
* compute Choleski root for transformation;
  t=root(cor);
 * Initialize data vector with random number seed;
  dat=J(&ssize,5,&seed);
  d = rannor(dat);
* impose the desired covariance structure;
  y = d*t;
  sample = J(&ssize, 1, 1);
  all = y ||sample;
  create &outdata from all [colname={y1 y2 y3 y4 x sample}] ;
  append from all;
  %do i = 2 %to &nrep;
    	dat=J(&ssize, 5 ,&seed*&i);
	    d = rannor(dat);
	    y = d*t;
	 sample = J(&ssize, 1, &i);
	 all = y ||sample;
     	 append from all;
  %end;
	quit;
%mend;
%msim(23455, 100, 1000, asim);
data areg ;
  set asim;
  by sample;
  retain seed1 seed2 seed3 seed4;
  if first.sample then do;
    seed1 = 1;
	seed2 = 3;
	seed3 = 5;
	seed4 = 7;
	end;
  if uniform(seed1) <= 0.3 then y1 = .;
  if uniform(seed2) <= 0.3 then y2 = .;
  if uniform(seed3) <= .05 then y3 = .;
  if uniform(seed4) <= .05 then y4 = .;
  ybar = sum(of y:)/4;
  miss = 0;
  array ally(4) y1 - y4;
  do i = 1 to 4;
    if ally(i) = . then miss = miss + 1;
  end;
  if miss > 2 then ybar = .;
run;

ods listing close;
proc reg data = areg ;
  by sample;
  model ybar = x /clb;
    ods output  ParameterEstimates = page158;
run;
quit;
ods listing;

data page158a;
  set page158;
  c = (lowercl <=.3 <=uppercl);
run;

options nolabel;
proc means data = page158a mean;
  where variable = 'X';
  var estimate c;
run;
Variable            Mean
------------------------
Estimate       0.2263559
c              0.7330000
------------------------

Single Imputation

Figure 3 on page 160 based on data set below.

data schafer;
  input x y y0;
datalines;
169 148 148
126 123 .
132 149 .
160 169 .
105 138 .
116 102 .
125 88 .
112 100 .
133 150 .
94 113 .
109 78 .
109 96 .
106 148 .
176 137 .
128 155 .
131 131 .
130 101 101
145 155 .
136 140 .
146 134 .
111 129 .
97 85  85
134 124 124
153 112 .
118 118 .
137 122 122
101 119 .
103 106 106
78 74 74
151 113 .
;
run;
data table1;
  set schafer;
  y2 = y; ***MAR;
  if x <=140 then y2 = .;
  y3 = y; ***NMAR;
  if y <=140 then y3 =.;
run;

Part (a): Mean Substitution

proc sql;
  select mean(y2)into :ymean
  from table1;
quit;
data mean_sub;
  set table1;
  ymean = y2;
  if ymean = . then ymean = &ymean;
run;
symbol v=circle ;
axis1 order = (80 to 180 by 20) minor = none label=(a=90 'y');
axis2 order = (80 to 180 by 20) minor = none;
proc gplot data = mean_sub;
  plot ymean*x /vaxis= axis1 haxis=axis2; 
run;
quit;
 

Part (b) Hot Deck:

*hotdeck for univariate case: replacing each missing
 value by a random draw from the observed values;
data hotdeck_sample;
  set table1 (where=(y2~=.));
run;
proc surveyselect data = hotdeck_sample method = URS rep = 1 
                         sampsize = 30 seed = 12345 out = hotdecked;
  id _all_;
run;
data hota;
  set hotdecked;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  rename y2 = y2_hot;
  keep a y2;
run;
proc sort data = hota;
  by a;
run;
data hotall;
  merge table1 hota;
  if y2 ~=. then y2_hot=y2;
run;
symbol v=circle ;
proc gplot data = hotall;
  plot y2_hot*x /vaxis=axis1 haxis=axis2;
run;
quit;
 

Part (c) Conditional Mean:

proc reg data = table1;
  model y2 = x;
  output out = con_sub p=p;
run;
quit;
data con_suba;
  set con_sub;
  ycon = y2;
  if ycon = . then ycon = p;
run;
proc gplot data = con_suba;
  plot ycon*x /vaxis=axis1 haxis=axis2;
run;
quit;

Part (d) Predictive Distribution:

proc reg data = table1;
  model y2 = x;
  output out = pred r=r p = p;
run;
quit;data predictive;
  set pred (where=(r~=.));
run;
proc surveyselect data = predictive method = URS rep = 1 
                         sampsize = 30 seed = 12345 out = preda;
  id _all_;
run;
data predb;
  set preda;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  rename r = rdraw;
  keep a r;
run;
proc sort data = predb;
  by a;
run;
data predall;
  merge predb pred;
  ypred = y2;
  if y2 =. then ypred= p + rdraw;
run;
proc gplot data = predall;
  plot ypred*x /vaxis=axis1 haxis=axis2;
run;
quit;

Table 3 on page 161 using the simulated data set created earlier.

We first created two macro programs that will produce the top and bottom part of Table 3 for a given imputed data set. 

%macro part1(data, y1, y2, y3);
proc sql;
  create table mytable as
  select mean(&y1) as mcar_mu, std(&y1) as mcar_std,
         mean(&y2) as mar_mu,  std(&y2) as mar_std,
         mean(&y3) as mnar_mu, std(&y3) as mnar_std
  from &data
  group by sample;
quit;
*mu part;
proc sql;
  select mean(mcar_mu) as mcar_mu, sqrt(sum((mcar_mu - 125)**2)/&nsample) as rmse_mcar ,
         mean(mar_mu) as mar_mu, sqrt(sum((mar_mu - 125)**2)/&nsample) as rmse_car ,
	 mean(mnar_mu) as mnar_mu, sqrt(sum((mnar_mu - 125)**2)/&nsample) as rmse_mnar 
  from mytable;
quit;
*sigma part;
proc sql;
  select mean(mcar_std) as mcar_std, 
         sqrt(sum((mcar_std - 25)**2)/&nsample) as rmse_mcar_std ,
         mean(mar_std) as car_std, 
         sqrt(sum((mar_std - 25)**2)/&nsample) as rmse_car_std ,
	 mean(mnar_std) as mnar_std, 
         sqrt(sum((mnar_std - 25)**2)/&nsample) as rmse_mnar_std 
  from mytable;
quit;
*correlation part;
proc corr data = &data noprint nosimple outp=corr_part (where =(_type_='CORR'));
  var x;
  with &y1 &y2 &Y3;
  by sample;
run;
proc sql;
  select mean(x) as corr_&y1, sqrt(sum((x -.6)**2)/&nsample) as rmse
  from corr_part
  where _name_ = "&&y1";
  select mean(x) as corr_&y2, sqrt(sum((x - .6)**2)/&nsample) as rmse
  from corr_part
  where _name_="&&y2";
  select mean(x) as corr_&y3, sqrt(sum((x - .6)**2)/&nsample) as rmse
  from corr_part
  where _name_="&&y3";
quit;
*beta part;
options nonotes;
*y on x;
proc reg data = &data outest=&y1._on_x noprint;
  by sample;
  model &y1 = x;
run;
proc sql;
  select mean(x) as &y1._on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from &y1._on_x;
quit;
proc reg data = &data outest=&y2._on_x noprint;
  by sample;
  model &y2 = x;
run;
proc sql;
  select mean(x) as &y2._on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from &y2._on_x;
quit;
proc reg data = &data outest=&y3._on_x noprint;
  by sample;
  model &y3 = x;
run;
proc sql;
  select mean(x) as &y3._on_x, sqrt(sum((x-.6)**2)/&nsample) as rmse
  from &y3._on_x;
quit;
*x on y;
proc reg data = &data outest=x_on_&y1 noprint;
  by sample;
  model x = &y1;
run;
quit;
proc sql;
  select mean(&y1) as x_on_&y1, sqrt(sum((&y1-.6)**2)/&nsample) as rmse
  from x_on_&y1;
quit;
proc reg data = &data outest=x_on_&y2 noprint;
  by sample;
  model x = &y2;
run;
quit;
proc sql;
  select mean(&y2) as x_on_&y2, sqrt(sum((&y2-.6)**2)/&nsample) as rmse
  from x_on_&y2;
quit;
proc reg data = &data outest=x_on_&y3 noprint;
  by sample;
  model x = &y3;
run;
quit;
proc sql;
  select mean(&y3) as x_on_&y3, sqrt(sum((&y3-.6)**2)/&nsample) as rmse
  from x_on_&y3;
quit;
options notes;
%mend;
%macro part2(data, y1, y2, y3);
ods listing close;
ods output BasicIntervals = cover;
proc univariate data = &data cibasic;
  class sample ;
  var &y1 &y2 &y3;
run;
ods listing;
data cover1;
  set cover ;
  if parameter = 'Mean' then cmean = (lowercl <= 125 <=uppercl)*100;
  width = uppercl - lowercl;
  if parameter = 'Std Deviation' then cmean = (lowercl <=25<=uppercl)*100;
run;
* mu part;
proc means data = cover1 mean ;
  title "Coverage of mean from &y1";
  where varname = "&y1" and parameter='Mean';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage of mean from &y2";
  where varname = "&y2" and parameter='Mean';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage of mean from y3";
  where varname = "&y3" and parameter='Mean';
  var cmean width;
run;
* sigma part;
proc means data = cover1 mean;
  title "Coverage for std from &y1";
  where varname = "&y1" and parameter='Std Deviation';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage for std from &y2";
  where varname = "&y2" and parameter='Std Deviation';
  var cmean width;
run;
proc means data = cover1 mean;
  title "Coverage for std from &y3";
  where varname = "&y3" and parameter='Std Deviation';
  var cmean width;
run;
* on correlation;
proc corr data = &data noprint nosimple outp=corr_part (where =(_type_='CORR'));
  var x;
  with &y1 &y2 &Y3;
  by sample;
run;
data rho_part;
  set corr_part;
  *for real x<1 the inverse hyperbolic tangent is 
   tanh^(-1)(x) = 1/2*log((1+x)/(1-x));
  rl = tanh(1/2*log((1+x)/(1-x)) - 1.96/sqrt(&ssize - 3));
  ru = tanh(1/2*log((1+x)/(1-x)) + 1.96/sqrt(&ssize - 3));
  c = (rl <=.6 <=ru)*100;
  width = ru - rl;
run;
proc means data = rho_part mean;
  where _name_ = "&y1";
  title "coverate for rho from &y1";
  var c width;
run;
proc means data = rho_part mean;
  where _name_ = "&y2";
  title "coverage for rho from &y2";
  var c width;
run;
proc means data = rho_part mean;
  where _name_ = "&y3";
  title "coverage for rho from &y3";
  var c width;
run;
*on betas: y on x;
ods listing close;
proc reg data = &data  ;
  by sample;
  model &y1 = x /clb;
  ods output  ParameterEstimates = &y1._on_x;
run;
quit;
ods listing;
data &y1._on_x;
  set &y1._on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = &y1._on_x mean;
  title "coverage for beta on x from &y1";
  var c width;
run;
ods listing close;
proc reg data = &data  ;
  by sample;
  model &y2 = x /clb;
  ods output  ParameterEstimates = &y2._on_x;
run;
quit;
ods listing;
data &y2._on_x;
  set &y2._on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = &y2._on_x mean;
  title "coverage for beta on x from &y2";
  var c width;
run;
ods listing close;
proc reg data = &data  ;
  by sample;
  model &y3 = x /clb;
  ods output  ParameterEstimates = &y3._on_x;
run;
quit;
ods listing;
data &y3._on_x;
  set &y3._on_x  (where = (variable="x"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = &y3._on_x mean;
  title "coverage for beta on x from &y3";
  var c width;
run;
* on betas: x on y;
options nonotes;
ods listing close;
proc reg data = &data ;
  by sample;
  model x = &y1 /clb;
  ods output  ParameterEstimates = x_on_&y1;
run;
quit;
ods listing;
data x_on_&y1;
  set x_on_&y1  (where = (variable="&y1"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_&y1 mean;
  title "coverage for beta on y from &y1";
  var c width;
run;
ods listing close;
proc reg data = &data ;
  by sample;
  model x = &y2 /clb;
  ods output  ParameterEstimates = x_on_&y2;
run;
quit;
ods listing;
data x_on_&y2;
  set x_on_&y2  (where = (variable="&y2"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_&y2 mean;
title "coverage for beta on y from &y2";
  var c width;
run;
ods listing close;
proc reg data = &data ;
  by sample;
  model x = &y3 /clb;
  ods output  ParameterEstimates = x_on_&y3;
run;
quit;
ods listing;
data x_on_&y3;
  set x_on_&y3  (where = (variable="&y3"));
  c = (lowercl <=.6 <= uppercl)*100;
  width = uppercl - lowercl;
run; 
proc means data = x_on_&y3 mean;
title "coverage for beta on y from &y3";
  var c width;
run;
%mend;

Method 1: Mean Substitution

proc sql; 
  create table ms_a as
  select sample, x,
         sum(y1, (y1=.)*mean(y1)) as y1_ms,
		 sum(y2, (y2=.)*mean(y2)) as y2_ms ,
		 sum(y3, (y3=.)*mean(y3)) as y3_ms
  from simdata_a
  group by sample;
quit; 
title;
options nocenter nodate nonumber formchar = '|----|+|---+=|-/<>*' formdlim=" ";
%part1(ms_a, y1_ms, y2_ms, y3_ms);
 mcar_mu  rmse_mcar    mar_mu  rmse_car   mnar_mu  rmse_mnar
------------------------------------------------------------
125.1212   6.851562  143.1475  19.11867  155.3093   30.50782

            rmse_mcar_                                      rmse_mnar_
mcar_std           std   car_std  rmse_car_std  mnar_std           std
----------------------------------------------------------------------
12.24263      13.11604  10.50443      14.72017  6.112032       18.9737

corr_y1_ms      rmse
--------------------
   0.29491  0.330813

corr_y2_ms      rmse
--------------------
  0.085841  0.519119

corr_y3_ms      rmse
--------------------
  0.141434  0.474257

y1_ms_on_x      rmse
--------------------
   0.15312  0.455417

y2_ms_on_x      rmse
--------------------
  0.037815  0.563228

y3_ms_on_x      rmse
--------------------
  0.037612   0.56357

x_on_y1_ms      rmse
--------------------
    0.6035    0.2583

x_on_y2_ms      rmse
--------------------
  0.205958  0.434805

x_on_y3_ms      rmse
--------------------
  0.580092  0.535735
%part2(ms_a, y1_ms, y2_ms, y3_ms);
Coverage of mean from y1_ms
Variable            Mean
------------------------
cmean         38.0000000
width          6.9586356
------------------------
Coverage of mean from y2_ms
Variable            Mean
------------------------
cmean          0.5000000
width          5.9706525
------------------------
Coverage of mean from y3
Variable            Mean
------------------------
cmean                  0
width          3.4740407
------------------------
Coverage for std from y1_ms
Variable            Mean
------------------------
cmean          0.4000000
width          5.0292657
------------------------
Coverage for std from y2_ms
Variable            Mean
------------------------
cmean                  0
width          4.3152134
------------------------
Coverage for std from y3_ms
Variable            Mean
------------------------
cmean                  0
width          2.5108188
------------------------
coverate for rho from y1_ms
Variable            Mean
------------------------
c             23.6000000
width          0.5029756
------------------------
coverage for rho from y2_ms
Variable            Mean
------------------------
c                      0
width          0.5502732
------------------------
coverage for rho from y3_ms
Variable            Mean
------------------------
c              2.1000000
width          0.5388044
------------------------
coverage for beta on x from y1_ms
Variable            Mean
------------------------
c              0.7000000
width          0.2697042
------------------------
coverage for beta on x from y2_ms
Variable            Mean
------------------------
c                      0
width          0.2437750
------------------------

coverage for beta on x from y3_ms
Variable            Mean
------------------------
c                      0
width          0.1400849
------------------------
coverage for beta on y from y1_ms
Variable            Mean
------------------------
c             98.0000000
width          1.2041194
------------------------
coverage for beta on y from y2_ms
Variable            Mean
------------------------
c             92.4000000
width          1.4566565
------------------------
coverage for beta on y from y3_ms
Variable            Mean
------------------------
c             98.2000000
width          2.5620881
------------------------

Method 2: Hotdeck

data hotdecked1;
  set simdata_a (where=(y1~=.));
  keep sample y1;
run;
proc surveyselect data = hotdecked1 method = URS rep = 1 
                  sampsize = 50 seed = 12345 out = hotdecked_y1;
  strata sample;
  id _all_;
run;
data hotdecked2;
  set simdata_a (where=(y2~=.));
  keep sample y2;
run;
proc surveyselect data = hotdecked2 method = URS rep = 1 
                  sampsize = 50 seed = 12345 out = hotdecked_y2;
  strata sample;
  id _all_;
run;
data hotdecked3;
  set simdata_a (where=(y3~=.));
  keep sample y3;
run;
proc surveyselect data = hotdecked3 method = URS rep = 1 
                  sampsize = 50 seed = 12345 out = hotdecked_y3;
  strata sample;
  id _all_;
run;

data hot_y1;
  set hotdecked_y1;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  rename y1 = y1_hot;
  keep a y1 sample;
run;
data hot_y2;
  set hotdecked_y2;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  rename y2 = y2_hot;
  keep a y2 sample;
run;
data hot_y3;
  set hotdecked_y3;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  rename y3 = y3_hot;
  keep a y3 sample;
run;
proc sort data = hot_y1;
  by sample a;
run;
proc sort data = hot_y2;
  by sample a;
run;
proc sort data = hot_y3;
  by sample a;
run;
data hotall;
  merge simdata_a hot_y1 hot_y2 hot_y3;
  by sample;
  if y1 ~=. then y1_hot = y1;
  if y2 ~=. then y2_hot = y2;
  if y3 ~=. then y3_hot = y3;
run;
options nocenter nodate nonumber formchar = '|----|+|---+=|-/<>*' formdlim=" ";
%part1(hotall, y1_hot, y2_hot, y3_hot);
 mcar_mu  rmse_mcar    mar_mu  rmse_car   mnar_mu  rmse_mnar
------------------------------------------------------------
125.1359   7.476645  143.2299   19.3382  155.2633   30.49208
            rmse_mcar_                                      rmse_mnar_
mcar_std           std   car_std  rmse_car_std  mnar_std           std
----------------------------------------------------------------------
23.40481      5.541003  19.87496      6.826885  11.60998      13.78752

corr_y1_hot      rmse
---------------------
   0.160086  0.463393

corr_y2_hot      rmse
---------------------
   0.045763  0.565438

corr_y3_hot      rmse
---------------------
   0.085243  0.531706

y1_hot_on_x      rmse
---------------------
   0.155293  0.468177

y2_hot_on_x      rmse
---------------------
   0.038588  0.569104

y3_hot_on_x      rmse
---------------------
   0.041098  0.562834

x_on_y1_hot      rmse
---------------------
   0.174193  0.460122

x_on_y2_hot      rmse
---------------------
   0.056829  0.564269

x_on_y3_hot      rmse
---------------------
   0.189971  0.520863
%part2(hotall, y1_hot, y2_hot, y3_hot);
Coverage of mean from y1_hot
Variable            Mean
------------------------
cmean         62.1000000
width         13.3031472
------------------------
Coverage of mean from y2_hot
Variable            Mean
------------------------
cmean          2.0000000
width         11.2968036
------------------------
Coverage of mean from y3
Variable            Mean
------------------------
cmean                  0
width          6.5990411
------------------------
Coverage for std from y1_hot
Variable            Mean
------------------------
cmean         62.5000000
width          9.6146810
------------------------
Coverage for std from y2_hot
Variable            Mean
------------------------
cmean         41.9000000
width          8.1646216
------------------------
Coverage for std from y3_hot
Variable            Mean
------------------------
cmean          1.5000000
width          4.7693733
------------------------
coverate for rho from y1_hot
Variable            Mean
------------------------
c              5.1000000
width          0.5324571
------------------------
coverage for rho from y2_hot
Variable            Mean
------------------------
c              0.1000000
width          0.5491648
------------------------
coverage for rho from y3_hot
Variable            Mean
------------------------
c              1.1000000
width          0.5437949
------------------------
coverage for beta on x from y1_hot
Variable            Mean
------------------------
c             15.1000000
width          0.5342932
------------------------
coverage for beta on x from y2_hot
Variable            Mean
------------------------
c              0.5000000
width          0.4627474
------------------------
coverage for beta on x from y3_hot
Variable            Mean
------------------------
c                      0
width          0.2684218
------------------------
coverage for beta on y from y1_hot
Variable            Mean
------------------------
c             24.3000000
width          0.6350839
------------------------
coverage for beta on y from y2_hot
Variable            Mean
------------------------
c             16.8000000
width          0.7623825
------------------------
coverage for beta on y from y3_hot
Variable            Mean
------------------------
c             73.7000000
width          1.3352075
------------------------

Method 3: Conditional Mean

ods listing close;
proc reg data = simdata_a;
  by sample;
  model y1 = x;
  output out = cm_y1 p=y1_cm;
run;
quit;
proc reg data = simdata_a;
  by sample;
  model y2 = x;
  output out = cm_y2 p=y2_cm;
run;
quit;
proc reg data = simdata_a;
  by sample;
  model y3 = x;
  output out = cm_y3 p=y3_cm;
run;
quit;
data cm_all;
  merge simdata_a cm_y1 cm_y2 cm_y3;
  by sample;
  if y1~=. then y1_cm = y1;
  if y2~=. then y2_cm = y2;
  if y3~=. then y3_cm = y3;
run;
options nocenter nodate nonumber formchar = '|----|+|---+=|-/<>*' formdlim=" ";
ods listing;
title;
%part1(cm_all, y1_cm, y2_cm, y3_cm);
 mcar_mu  rmse_mcar    mar_mu  rmse_car   mnar_mu  rmse_mnar
------------------------------------------------------------
124.9894    6.09717  125.3847   17.4448  151.5307   26.88287

            rmse_mcar_                                      rmse_mnar_
mcar_std           std   car_std  rmse_car_std  mnar_std           std
----------------------------------------------------------------------
18.02616      8.882723  19.73373      10.81517  8.227147      17.09959

corr_y1_cm      rmse
--------------------
  0.778312  0.282195

corr_y2_cm      rmse
--------------------
  0.633492   0.46452

corr_y3_cm      rmse
--------------------
  0.518901  0.418478

y1_cm_on_x      rmse
--------------------
  0.584609   0.27461

y2_cm_on_x      rmse
--------------------
  0.582127  0.543593

y3_cm_on_x      rmse
--------------------
  0.197364  0.443121

x_on_y1_cm      rmse
--------------------
  1.121899   0.65912

x_on_y2_cm      rmse
--------------------
  0.804306  0.764183

x_on_y3_cm      rmse
--------------------
  1.554164  1.735976
%part2(cm_all, y1_cm, y2_cm, y3_cm);
Coverage of mean from y1_cm
Variable            Mean
------------------------
cmean         60.5000000
width         10.2459539
------------------------
Coverage of mean from y2_cm
Variable            Mean
------------------------
cmean         27.6000000
width         11.2165253
------------------------
Coverage of mean from y3
Variable            Mean
------------------------
cmean          0.2000000
width          4.6762584
------------------------
Coverage for std from y1_cm
Variable            Mean
------------------------
cmean         30.3000000
width          7.4051333
------------------------
Coverage for std from y2_cm
Variable            Mean
------------------------
cmean         30.0000000
width          8.1066015
------------------------
Coverage for std from y3_cm
Variable            Mean
------------------------
cmean          0.6000000
width          3.3797065
------------------------
coverate for rho from y1_cm
Variable            Mean
------------------------
c             22.5000000
width          0.2008540
------------------------
coverage for rho from y2_cm
Variable            Mean
------------------------
c             18.5000000
width          0.2205123
------------------------
coverage for rho from y3_cm
Variable            Mean
------------------------
c             37.1000000
width          0.3204989
------------------------
coverage for beta on x from y1_cm
Variable            Mean
------------------------
c             37.5000000
width          0.2203735
------------------------
coverage for beta on x from y2_cm
Variable            Mean
------------------------
c             19.3000000
width          0.2202171
------------------------
coverage for beta on x from y3_cm
Variable            Mean
------------------------
c              3.0000000
width          0.1282304
------------------------
coverage for beta on y from y1_cm
Variable            Mean
------------------------
c             11.1000000
width          0.5264349
------------------------
coverage for beta on y from y2_cm
Variable            Mean
------------------------
c             17.4000000
width          0.5944075
------------------------
coverage for beta on y from y3_cm
Variable            Mean
------------------------
c             20.3000000
width          1.5733904
------------------------

Method 4: Predictive Distribution

ods listing close;
proc reg data = simdata_a;
  by sample;
  model y1 = x;
  output out = pd_y1 r=r_y1 p = p_y1;
run;
quit;
proc surveyselect data = pd_y1_a (where=(r_y1~=.) method = URS rep = 1 
                         sampsize = 50 seed = 12345 out = pd_y1a;
  strata sample;
  id _all_;
run;
data pd_y1a; /*expand the data*/
  set pd_y1a;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  keep sample a r_y1;
run;
proc sort data = pd_y1a;
  by sample a;
run;
proc reg data = simdata_a;
  by sample;
  model y2 = x;
  output out = pd_y2 r=r_y2 p = p_y2;
run;
quit;
data pd_y2_a;
  set pd_y2 (where=(r_y2~=.));
  keep sample r_y2;
run;
proc surveyselect data = pd_y2_a method = URS rep = 1 
                         sampsize = 50 seed = 12345 out = pd_y2a;
  strata sample;
  id _all_;
run;
data pd_y2a;
  set pd_y2a;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  keep sample a r_y2;
run;
proc sort data = pd_y2a;
  by sample a;
run;
proc reg data = simdata_a;
  by sample;
  model y3 = x;
  output out = pd_y3 r=r_y3 p = p_y3;
run;
quit;
data pd_y3_a;
  set pd_y3 (where=(r_y3~=.));
  keep sample r_y3;
run;
proc surveyselect data = pd_y3_a method = URS rep = 1 
                         sampsize = 50 seed = 12345 out = pd_y3a;
  strata sample;
  id _all_;
run;
data pd_y3a;
  set pd_y3a;
  do i = 1 to numberhits;
   a = uniform(0);
    output;
  end;
  keep sample a r_y3;
run;
proc sort data = pd_y3a;
  by sample a;
run;
data pdall;
  merge pd_y1 pd_y1a pd_y2 pd_y2a pd_y3 pd_y3a;
  by sample;
  y1_pd = y1;
  y2_pd = y2;
  y3_pd = y3;
  if y1 =. then y1_pd= p_y1 + r_y1;
  if y2 =. then y2_pd= p_y2 + r_y2;
  if y3 =. then y3_pd= p_y3 + r_y3;
  keep sample y1_pd y2_pd y3_pd x y1 y2 y3;
run;
ods listing;
title;
%part1(pdall, y1_pd, y2_pd, y3_pd);
 mcar_mu  rmse_mcar    mar_mu  rmse_car   mnar_mu  rmse_mnar
------------------------------------------------------------
125.0335   6.452622  125.4237   17.5308  151.5479   26.93288

            rmse_mcar_                                      rmse_mnar_
mcar_std           std   car_std  rmse_car_std  mnar_std           std
----------------------------------------------------------------------
23.87999      5.650053  25.40816      8.559005  12.24355       13.2747

corr_y1_pd      rmse
--------------------
  0.592785   0.22216

corr_y2_pd      rmse
--------------------
   0.51098  0.397096

corr_y3_pd      rmse
--------------------
   0.37345  0.392657

y1_pd_on_x      rmse
--------------------
  0.582993  0.288618

y2_pd_on_x      rmse
--------------------
  0.582361  0.548709

y3_pd_on_x      rmse
--------------------
  0.199443  0.444836

x_on_y1_pd      rmse
--------------------
    0.6331  0.261128

x_on_y2_pd      rmse
--------------------
  0.492133  0.414963

x_on_y3_pd      rmse
--------------------
   0.76266  0.736598
%part2(pdall, y1_pd, y2_pd, y3_pd);
Coverage of mean from y1_pd
Variable            Mean
------------------------
cmean         71.2000000
width         13.5732371
------------------------
Coverage of mean from y2_pd
Variable            Mean
------------------------
cmean         33.2000000
width         14.4418396
------------------------
Coverage of mean from y3
Variable            Mean
------------------------
cmean          0.1000000
width          6.9591564
------------------------
Coverage for std from y1_pd
Variable            Mean
------------------------
cmean         63.4000000
width          9.8098851
------------------------
Coverage for std from y2_pd
Variable            Mean
------------------------
cmean         48.2000000
width         10.4376565
------------------------
Coverage for std from y3_pd
Variable            Mean
------------------------
cmean          3.3000000
width          5.0296421
------------------------
coverate for rho from y1_pd
Variable            Mean
------------------------
c             62.3000000
width          0.3423744
------------------------
coverage for rho from y2_pd
Variable            Mean
------------------------
c             39.0000000
width          0.3355097
------------------------
coverage for rho from y3_pd
Variable            Mean
------------------------
c             45.1000000
width          0.4280373
------------------------
coverage for beta on x from y1_pd
Variable            Mean
------------------------
c             58.1000000
width          0.4187768
------------------------
coverage for beta on x from y2_pd
Variable            Mean
------------------------
c             33.3000000
width          0.4158385
------------------------
coverage for beta on x from y3_pd
Variable            Mean
------------------------
c              7.6000000
width          0.2420276
------------------------
coverage for beta on y from y1_pd
Variable            Mean
------------------------
c             69.4000000
width          0.4910793
------------------------
coverage for beta on y from y2_pd
Variable            Mean
------------------------
c             59.6000000
width          0.4912461
------------------------
coverage for beta on y from y3_pd
Variable            Mean
------------------------
c             53.7000000
width          1.1271793
------------------------

Table 4 on page 164: Performance of ML


How to cite this page

Report an error on this page

UCLA Researchers are invited to our Statistical Consulting Services
We recommend others to our list of Other Resources for Statistical Computing Help
These pages are Copyrighted (c) by UCLA Academic Technology Services


The content of this web site should not be construed as an endorsement of any particular web site, book, or software product by the University of California