options nocenter nodate nonumber formdlim="-";
*Arrays Seminar;

*Recoding variables;
*Note: single dash in variable list means numerically consecutive;
data faminc;
  input famid faminc1-faminc12 ;
cards;
1 3281 3413 3114 2500 2700 3500 3114 3319 3514 1282 2434 2818
2 4042 3084 3108 3150 3800 3100 1531 2914 3819 4124 4274 4471
3 6015 6123 6113 6100 6100 6200 6186 6132 3123 4231 6039 6215
;
run;
/*heading option controls direction of col headings*/
proc print data=faminc heading=H; 
run;
*Using if-then manually;
data recode_manual;
  set faminc;
  if faminc1 < 3000 then faminc1=.;
  if faminc2 < 3000 then faminc2=.;
  if faminc3 < 3000 then faminc3=.;
  if faminc4 < 3000 then faminc4=.;
  if faminc5 < 3000 then faminc5=.;
  if faminc6 < 3000 then faminc6=.;
  if faminc7 < 3000 then faminc7=.;
  if faminc8 < 3000 then faminc8=.;
  if faminc9 < 3000 then faminc9=.;
  if faminc10 < 3000 then faminc10=.;
  if faminc11 < 3000 then faminc11=.;
  if faminc12 < 3000 then faminc12=.;
run;
proc print data=recode_manual noobs heading=H;
run;
*Recoding variables using an array;
data recode_array;
  set faminc;
  array Afaminc(12) faminc1-faminc12;
  do i = 1 to 12; 
    if Afaminc[i] < 3000 then Afaminc[i] = . ;
  end;
  drop i;
run;
proc print data=recode_array noobs heading=H;
run;
********************************************************;
*Reverse items on a -3 to +3 scale using an array;
*Input the score data set;
data score;
  input item1 item2 item3 item4;
cards;
-2   1   -3   0
-1   2   -2   1
 0  -1   -3  -1
;
run;
proc print data=score;
run;
data score_array1;
  set score;
  array item(4) item1-item4;
  do i = 1 to 4;
   item[i] = -1*item[i];
  end;
run;
proc print data=score_array1;
run;
********************************************;
*Computing new variables;
*computing the tax income variables manually;
data tax_manual;
 set faminc;
  taxinc1 = faminc1 * .10 ;
  taxinc2 = faminc2 * .10 ;
  taxinc3 = faminc3 * .10 ;
  taxinc4 = faminc4 * .10 ;
  taxinc5 = faminc5 * .10 ; 
  taxinc6 = faminc6 * .10 ;
  taxinc7 = faminc7 * .10 ;
  taxinc8 = faminc8 * .10 ;
  taxinc9 = faminc9 * .10 ;
  taxinc10= faminc10 * .10 ;
  taxinc11= faminc11 * .10 ;
  taxinc12= faminc12 * .10 ;
run;
proc print data=tax_manual noobs heading=H;
  var famid faminc1-faminc12 taxinc1-taxinc12;
run;
*computing the same tax income variables using an array;
*The second array Ataxinc because we are creating 12 new variables;
data tax_array;
  set faminc;
  array Afaminc(12) faminc1-faminc12; /* existing vars */
  array Ataxinc(12) taxinc1-taxinc12; /* new vars */
  do month = 1 to 12;
   Ataxinc[month] = Afaminc[month]*0.1;
  end;
run;
proc print data=tax_array noobs heading=H;
  var famid faminc1-faminc12 taxinc1-taxinc12;
run;
**********************************************************;
*Collapsing over variables;
*Creating the total income per quarter variables manually;
data quarter_manual;
  set faminc;
  incq1 = faminc1 + faminc2 + faminc3;
  incq2 = faminc4 + faminc5 + faminc6;
  incq3 = faminc7 + faminc8 + faminc9;
  incq4 = faminc10 + faminc11 + faminc12; 
run;
proc print data=quarter_manual;
  var incq1 faminc1-faminc3;
run;
*Creating the total income per quarter variables using arrays;
data quarter_array;
  set faminc;
  array Afaminc(12) faminc1-faminc12; /* existing vars */
  array Aquarter(4) incq1-incq4; /* new vars */
  do q = 1 to 4;
   Aquarter[q] = Afaminc[3*q-2] + Afaminc[3*q-1] + Afaminc[3*q];
  end;
run;
/* For q=1:  Aquarter[1] = Afaminc[3*1-2] + Afaminc[3*1-1] + Afaminc[3*1]
                         = Afaminc[1] + Afaminc[2] + Afaminc[3] 
   For q=2:  Aquarter[2] = Afaminc[3*2-2] + Afaminc[3*2-1] + Afaminc[3*2]
                         = Afaminc[4] + Afaminc[5] + Afaminc[6] */  
proc print data=quarter_array;
  var incq1 faminc1-faminc3;
run;
***********************************************************;
*ID patterns across variables using arrays;
*ID months were income was less than half of previous month;
*Store information in dummy variables lowinc2-lowinc12;
*loop over months 2-12 (month 1 has no previous month!);
*Variable ever indicates if this ever happened;
data pattern;
  set faminc;
  length ever $ 4;
  array Afaminc(12) faminc1-faminc12; /*existing vars*/
  array Alowinc(2:12) lowinc2-lowinc12; /* new vars */
  do m = 2 to 12;
   if Afaminc[m] < (Afaminc[m-1] / 2) then Alowinc[m] = 1;
   else Alowinc[m] = 0;
  end;
  sum_low = sum(of lowinc:); /*sums over all vars with lowinc as part of name */
  if sum_low > 0 then ever='Yes';
  if sum_low = 0 then ever='No';
  drop m sum_low;
run;
proc print data=pattern noobs heading=H;
run;
************************************************************;
*Reshaping wide to long;
*Reshaping wide to long creating only one variable--Manually;
data wide; 
  input famid faminc96 faminc97 faminc98 ; 
cards; 
1 40000 40500 41000 
2 45000 45400 45800 
3 75000 76000 77000 
; 
run;
data long_manual;
  set wide;
  year=96;
  faminc=faminc96;
  output;
  year=97;
  faminc=faminc97;
  output;
  year=98;
  faminc=faminc98;
  output;
run;
proc print data=long_manual;
  var famid year faminc;
run;
*illustrating the importance of all the output statements;
data problem;
  set wide;
  year=96;
  faminc=faminc96;
  *output;
  year=97;
  faminc=faminc97;
  *output;
  year=98;
  faminc=faminc98;
  output;
run;
proc print data=problem;
  var famid year faminc;
run;
*reshaping wide to long creating only one variable using arrays;
*Note: we need the output statement inside the do loop;
*because we want to output after each year, just like in the above ex.;
data long_array;
  set wide;
  array Afaminc(96:98) faminc96 - faminc98;
  do year = 96 to 98;
   faminc = Afaminc[year];
   output; 
  end;
  drop faminc96-faminc98;
run;
proc print data=long_array;
run;
*Reshaping wide to long creating multiple variables (including string vars) using arrays;
data multi_wide;
  length  debt96 $ 3 debt97 $ 3 debt98 $ 3;
  input famid faminc96 faminc97 faminc98 spend96 spend97 spend98 debt96 debt97 debt98 ; 
cards; 
1 40000 40500 41000 38000 39000 40000 yes yes no 
2 45000 45400 45800 42000 43000 44000 yes no  no 
3 75000 76000 77000 70000 71000 72000 no  no  no 
; 
run;
proc print data=multi_wide;
run;
data multi_long;
  set multi_wide;
  length debt $ 3;
  array Afaminc(96:98) faminc96-faminc98;
  array Aspend(96:98) spend96-spend98;
  array Adebt(96:98) debt96-debt98;
  do year = 96 to 98;
   faminc = Afaminc[year];
   spend = Aspend[year];
   debt = Adebt[year];
   output;
  end;
  drop faminc96-faminc98 spend96-spend98 debt96-debt98;
run;
proc print data=multi_long;
  var famid year faminc spend debt;
run;
*Reshaping wide to long in presence of character suffixes;
*In the above example we had numeric suffixes (96, 97 and 98);
*We can reshape even if we have character suffixes such (old, now, future);
data character;
  length name_old $ 24 name_now $ 24 name_future $ 24;
  input id name_old $ name_now $ name_future $ inc_old inc_now inc_future;
cards;
1  Ramon  Martin  Martin_Sheen  23000  50000  700000
2  John  Johnnie  J_boy  10000 20000 600000
3  Mary_Cathleen  Bo  Bo_Derek  15000 40000 250000
;
run;
*double dash means positionally consecutive;
proc print data=character noobs;
  var id name_old--name_future inc_old--inc_future; 
run;
data character_array;
  set character;
  length name $ 24;
  array Aname(3) $ name_old name_now name_future;
  array Aincome(3) inc_old inc_now inc_future;
  do time= 1 to 3;
   name = Aname[time];
   income = Aincome[time];
   output;
  end;
run;
proc format;
  value t_format 1='old' 2='now' 3='future';
run;
proc print data=character_array noobs;
  format time t_format.;
  var id time name income; 
run;
******************************************************************************;
*Understanding the functions first. and last. as well as the retain statement.;
*In order to understand how to use arrays to reshape from long to wide we ;
*will need to understand how the first. and last. functions work as well ;
*as understand how the retain statement works.;
*Looking at examples of the retain statement.;
data missings;
  input id measurement;
cards;
1  .
1  2
3  .
2  3
3  4
2  .
3  .
1  .
3  5
3  6
;
run;
data ex_retain;
  set missings;
  retain new_meas 0; /*try running it with this line as a comment*/
  if measurement ne . then new_meas = measurement;
run;
proc print data=ex_retain;
run;
data ex2_retain;
  set missings;
  retain new1 0; /*try running it with this line commented out!*/
  if measurement ne . then new1 = new1 + measurement;
run;
proc print data=ex2_retain;
run;
*Examples of using the first. and last. functions.;
*We will be using first.id and last.id so sorting on id is necessary;
proc sort data=missings out=sort_miss;
  by id;
run;
*The by statement must precede the function call to first. and last.;
data ex1;
  set sort_miss;
  by id;
  if first.id then first=1;
   else first=0;
  if last.id then last=1;
   else last=0;
run;
proc print data=ex1;
run;
*Combining the first. function with a retain statement to get a running sum and count.;
data kids;
  length kidname $ 4;
  input famid kidname birth_order wt;
cards;
1 Beth 1  60
1 Barb 3  20
4 Sam  1 100
4 Stu  2  90
1 Bob  2  40
3 Pete 1  60
3 Phil 3  20
2 Andy 1  80
3 Pam  2  40
2 Al   2  50
2 Ann  3  20
;
run;
*we will be using first.famid and last.famid so sorting on famid is necessary;
proc sort data=kids out=sort_kids;
  by famid;
run;
data retain1;
  set sort_kids;
  retain sumwt count; /*carry over the value from previous observation to next obs*/
  by famid; /* requires sorting */
  if first.famid then do; /*at first obs of each family set sumwt and count equal to 0*/
    sumwt=0;
	count=0;
  end;
  sumwt = sumwt + wt;
  count = count + 1;
  meanwt = sumwt/count;
  *if last.famid then output; /*output only the last obs for each family*/
run;
proc print data=retain1 noobs;
  var famid kidname wt sumwt count meanwt;
run;
*outputting only the final sumwt, count and meanwt per family;
data retain2;
  set retain1;
  by famid;
  if last.famid then output; /*output only the last obs for each family*/
run;
proc print data=retain2 noobs;
  var famid sumwt count meanwt;
run;
*******************************************************************;
*Reshaping long to wide using arrays;
*We will use the long_array data set created from the wide data set;
*and we will reshape it back to the original wide format;
proc print data=long_array;
run;
*we will be using first.famid so sorting on famid is necessary;
proc sort data=long_array out=long_sort;
  by famid;
run;
data wide_array1;
  set long_sort;
  by famid;
  retain faminc96-faminc98;
  array Afaminc(96:98) faminc96-faminc98;
  if first.famid then do;
    do i = 96 to 98;
	 Afaminc[i] = .;
	end;
  end;
  Afaminc(year) = faminc; /*looping across values in the variable year*/
  *if last.famid then output; /*outputs only the last obs per family*/
  drop year faminc i;
run;
proc print data=wide_array1 noobs;
run;
data wide_array;
  set long_sort;
  by famid;
  retain faminc96-faminc98;
  array Afaminc(96:98) faminc96-faminc98;
  if first.famid then do;
    do i = 96 to 98;
	 Afaminc[i] = .;
	end;
  end;
  Afaminc(year) = faminc; /*looping across values in the variable year*/
  if last.famid then output; /*outputs only the last obs per family*/
  drop year faminc i;
run;
proc print data=wide_array noobs;
run;
proc print data=wide noobs;
run;
*************************************************************;
*A more subtle use of arrays.  An issue in SAS data management is that 
*we can not access multiple observations at one time.  Thus, we can not do
*comparisons across observations.
*One solution to this problem is to transpose the data from long to wide;
*then we can use the array to do the comparisons very easily.;
*Goal: to compare each observation with the previous and the next observation;
*If they are the same then flag the observation.;
data real_life;
  input person topicA;
cards;
1   0  
1   1  
3  -1  
1   0  
2   0  
1   1  
2  -1  
2  -1  
3   0  
3   1  
4   0  
1   1  
4   1  
4   0  
2  -1  
4   0  
4   0  
1  -1  
;
run;
proc print data=real_life noobs;
run;
*First we need to number the observations within each person;
*we must sort on person since we will be using first.person;
proc sort data=real_life out=sort_real;
  by person;
run;
data count_real;
  set sort_real;
  retain count;
  by person;
  if first.person then count = 0;
  count = count + 1;
run;
proc print data=count_real noobs;
run;
data wide_real;
  set count_real;
  array AtopicA(6) topicA_1-topicA_6;
  retain topicA_1-topicA_6;
  by person;
  if first.person then do;
    do i = 1 to 6;
	 AtopicA[i] = .; /* setting everything to missing */
	end;
  end;
  AtopicA(count) = topicA; /*looping across values in the variable count*/
  if last.person then output; /* outputs only the last obs per person */
run;
proc print data=wide_real noobs;
  var person topicA_1-topicA_6;
run;
*Now, let's find the people who have the same value for 3 observations in a row;
data three;
  set wide_real;
  array topic(6) topicA_1-topicA_6;
  do i = 2 to 5;
   if topic[i-1] ne . & topic[i] ne . & topic[i+1] ne . & 
      topic[i]=topic[i-1] & topic[i]=topic[i+1] then flagA=1;
  end;
  if flagA=. then flagA=0;
run;
proc print data=three noobs;
  var person topicA_1-topicA_6 flagA;
run;
