UCLA Academic Technology Services HomeServicesClassesContactJobs
Search

SAS Code Fragments
Merging the NSAF data files

The code below illustrates how to combine the NSAF data sets. 

You can download the NSAF data files from http://anfdata.urban.org/nsaf/cpuf/1999nsafdata.cfm .  Note that you need to register to use these data files.  You can view the documentation for these data at http://www.urban.org/uploadedPDF/1999_Methodology_11.pdf .

Some of the data files have variables with the same name but different information in them.  The macro renames the variables so that they do not overwrite one another during the merge.

options nocenter nodate formchar="|----|+|---+=|-/\<>" ;

%macro rname(libname, data, outdata, suffix);
  proc sql noprint;
  select name into :varlist separated by " "
  from sashelp.vcolumn
  where libname=upcase("&libname") and memname=upcase("&data");
quit;
  %let k=1;
  %local change ;
   %let m = %scan(&varlist, &k);
     %do %while(&m NE);
     %let k = %eval(&k + 1);
     %let change = &change &m  = &m&suffix;
       %let m = %scan(&varlist, &k);
     %end;
  data &outdata ;
    set  &data;
      rename &change ;
  run;
%mend ;

* NOTE:  You want to do the sort after you drop the extra variables to minimize the sort time;
* NOTE:  You do NOT want to drop the pweight nor the replicate weights;
data temp1;
set "D:\focalchd";
keep hhid persid respid ufamid ucpsid fhosp fdent femer fment fdoct site file;
file = 1;
run;
proc print data = temp1 (obs = 5);
run;
%rname(work, temp1, temp1s, _1);
* There are three ways shown to check for unique id's:  1) sort with the nodup option, 
2) creating a flag variable and 3) creating different data sets;
* You need to check the log to make sure that no duplicate observations were deleted.;
* If observations were deleted, it means that the variables listed on the by statement do not
uniquely identify the observations and you need to choose new variables that will.;
proc sort data = temp1s nodup;
by hhid_1 persid_1;
run;
* NOTE:  You need to run the proc contents to see how many characters the id variables
are using;
proc contents data = temp1s;
run;
* NOTE:  The input function "destrings" the variables.  These variables need to be in
numeric format to avoid problems with the merge;
data temp1s;
set temp1s;
hhid=input(hhid_1,best8.);
persid=input(persid_1,best10.);
respid=input(respid_1,best10.);
ufamid=input(ufamid_1,best10.);
ucpsid=input(ucpsid_1,best10.);
drop hhid_1 persid_1 respid_1 ufamid_1 ucpsid_1;
run;

data temp2;
set "D:\adult_pr";
keep hhid persid lwhunp lfdstmp lgenass site file;
file = 1;
run;
%rname(work, temp2, temp2s, _2);
proc sort data = temp2s;
by hhid_2 persid_2;
run;
data temp2s;
set temp2s;
hhid=input(hhid_2,best8.);
persid=input(persid_2,best10.);
drop hhid_2 persid_2;
run;
proc sort data = temp2s;
by  hhid persid ;
run;
proc print data = temp2s (obs = 5);
run;
data flag;
set temp2s;
by hhid persid;
if first.hhid or first.persid then flag = 0;
else flag = 1;
run;
proc print data = flag;
where flag = 1;
run;

data temp3;
set "D:\adult_rn";
keep hhid persid site sex typeint fhosp file;
file = 1;
run;
%rname(work, temp3, temp3s, _3);
data temp3s;
set temp3s;
hhid=input(hhid_3,best8.);
persid=input(persid_3,best10.);
drop hhid_3 persid_3;
run;
proc sort data = temp3s;
by hhid persid;
run;
data nodup dup;
set temp3s;
by hhid persid;
if first.hhid or first.persid then output nodup;
else output dup;
run;
* NOTE:  If everything worked correctly, all of the observations will be in the nodup file
and none of them will be in the dup file.;
proc contents data = dup;
run;

data temp4;
set "D:\adult_rb";
keep hhid persid site age sex state uregion file;
file = 1;
run;
%rname(work, temp4, temp4s, _4);
proc sort data = temp4s nodup;
by hhid_4 persid_4;
run;
data temp4s;
set temp4s;
hhid=input(hhid_4,best8.);
persid=input(persid_4,best10.);
drop hhid_4 persid_4;
run;

* NOTE:  The order in which you merge the data files MATTERS!!!
You need to merge the file with the least restrictive merge last!
In this case, all of the files have two variables on which to merge,
except the household data file, which has only one merge variable.
Because it only has one merge variable, the household data set has the
least restrictive merge, and needs to be merged last.;

* the libname statements are due to a stat-transfer error and is a temporary work around;
* the second data set putting the data back into the temporary directory is necessary
for the macro to work correctly;
* NOTE:  You need to have the file as an .sd2 file for this to work;
libname in v6 "D:\";
libname out v8 "D:\temp";
data out.temp5;
set in.famresp;
keep hhid respid site kafdctim lcredit kaskcare file;
file = 1;
run;
data temp5;
set out.temp5;
run;
%rname(work, temp5, temp5s, _5);
proc sort data = temp5s nodup;
by hhid_5 respid_5;
run;
data temp5s;
set temp5s;
hhid=input(hhid_5,best8.);
respid=input(respid_5,best10.);
drop hhid_5 respid_5;
run;

data temp6;
set "D:\person";
keep hhid persid sex site uexprrp upfrel xsupnum file;
file = 1;
run;
%rname(work, temp6, temp6s, _6);
proc sort data = temp6s nodup;
by hhid_6 persid_6;
run;
data temp6s;
set temp6s;
hhid=input(hhid_6,best8.);
persid=input(persid_6,best10.);
drop hhid_6 persid_6;
run;

data temp7;
set "D:\househld";
keep hhid varstrat varunit state ufips uhhcount site file;
file = 1;
run;
%rname(work, temp7, temp7s, _7);
proc sort data = temp7s nodup;
by hhid_7;
run;
data temp7s;
set temp7s;
hhid=input(hhid_7,best8.);
drop hhid_7;
run;

data out.temp8;
set in.socfam;
keep hhid ufamid site jeaf jab2pov jsupkid jchildf file;
file = 1;
run;
data temp8;
set out.temp8;
run;
%rname(work, temp8, temp8s, _8);
proc sort data = temp8s nodup;
by hhid_8 ufamid_8;
run;
data temp8s;
set temp8s;
hhid=input(hhid_8,best8.);
ufamid = input(ufamid_8,best10.);
drop hhid_8 ufamid_8;
run;

data temp9;
set "D:\cpsfam";
keep hhid ucpsid site ublinc ublga ubloth uincrpov uwrk file;
file = 1;
run;
%rname(work, temp9, temp9s, _9);
proc sort data = temp9s nodup;
by hhid_9 ucpsid_9;
run;
data temp9s;
set temp9s;
hhid=input(hhid_9,best8.);
ucpsid = input(ucpsid_9,best10.);
drop hhid_9 ucpsid_9;
run;

* NOTE:  You have to give the in= variables a new name later on in the data step; 
* NOTE:  You cannot name the data set "merge";
* merging focalchd adult_pr adult_rn adult_rb person;
data merge1;
merge temp1s(in=t1) temp2s(in=t2) temp3s(in=t3) temp4s(in=t4) temp6s(in=t6);
by hhid persid;
t1a = t1;
t2a = t2;
t3a = t3;
t4a = t4;
t6a = t6;
run;

proc contents data = merge1;
run;

proc sort data = merge1;
by hhid ufamid;
run;

* merging in socfam data;
data merge2;
merge merge1 temp8s(in=t8);
by hhid ufamid;
t8a = t8;
run;

proc sort data = merge2;
by hhid ucpsid;
run;

* merging in cpsfam data;
data merge3;
merge merge2 temp9s(in=t9);
by hhid ucpsid;
t9a = t9;
run;

proc sort data = merge3;
by hhid respid;
run;

* merging in famresp data;
data merge4;
merge merge3 temp5s(in=t5);
by hhid respid;
t5a = t5;
run;

proc sort data = merge4;
by hhid;
run;

* merging in household data;
data merge5;
merge merge4 temp7s(in=t7);
by hhid;
t7a = t7;
run;

* The procs below are used for verifying that the merge worked as expected;
proc freq data = merge5;
tables t1a*t2a*t3a*t4a*t5a*t6a*t7a*t8a*t9a / list missing;
run;

proc contents data = merge5;
run;

options nolabel;
proc means data = merge5 ;
run;

proc freq data = merge5;
tables t1a*t2a*t3a*t4a*t5a*t6a*t7a*t8a*t9a/list missing;
run;

* saving the temporary data set down to the hard drive and renaming it;
data "D:\nsaf99";
set merge5;
run;

/*

NOTE:  There are 193,805 observations in the final data set.

The table below is from the proc freq above.  It indicates which files contributed
observations.  For example, the first line indicates that files 7 and 9 contributed
17,573 observations to the final data set.

The FREQ Procedure

                                                                  Cumulative  Cumulative
t1a  t2a  t3a  t4a  t5a  t6a  t7a  t8a  t9a  Frequency   Percent   Frequency    Percent
----------------------------------------------------------------------------------------
  .    .    .    .    0    .    1    .    1     17573      9.07       17573       9.07
  .    .    .    .    0    .    1    1    0     14701      7.59       32274      16.65
  .    .    .    .    1    .    1    .    .     16765      8.65       49039      25.30
  0    0    0    0    0    1    1    0    0     34158     17.62       83197      42.93
  0    1    0    0    0    1    1    0    0     22107     11.41      105304      54.34
  0    1    1    0    0    1    1    0    0     35798     18.47      141102      72.81
  0    1    1    1    0    1    1    0    0     16765      8.65      157867      81.46
  1    0    0    0    1    1    1    1    1     35889     18.52      193756      99.97
  1    1    0    0    1    1    1    1    1        25      0.01      193781      99.99
  1    1    1    0    1    1    1    1    1        24      0.01      193805     100.00

*/

How to cite this page

Report an error on this page

UCLA Researchers are invited to our Statistical Consulting Services
We recommend others to our list of Other Resources for Statistical Computing Help
These pages are Copyrighted (c) by UCLA Academic Technology Services


The content of this web site should not be construed as an endorsement of any particular web site, book, or software product by the University of California