*********************************************************
*                   Entering Data                       *
*********************************************************; 

* Import the Excel workbook into the temporary dataset hs0.
  REPLACE lets this step be re-run when WORK.hs0 already exists
  (without it PROC IMPORT does not overwrite an existing dataset);
proc import datafile="c:\SAS_data\hs0.xls" out=hs0 replace;
run;

* Read the comma-separated file hs0.csv into the temporary dataset temp.
  The values are read with list input and prgtype is the only
  character variable (marked by the dollar sign);
DATA temp;
  INFILE 'c:\SAS_data\hs0.csv' DSD DELIMITER=',';
  INPUT gender id race ses schtyp
        prgtype $
        read write math science socst;
RUN;

* List the first 10 observations of temp as a quick check;
PROC PRINT DATA=temp(OBS=10);
RUN;

* Read a fixed-format file with column input. Each variable is taken
  from the column range that follows its name on the INPUT statement;
DATA fixed;
  INFILE "c:\SAS_data\schdat.fix";
  INPUT id      1-2
        a1      3-4
        t1      5-6
        gender  7
        a2      8-9
        t2      10-11
        tgender 12;
RUN;

* Print every observation of fixed;
PROC PRINT DATA=fixed;
RUN;

* Type the data directly into the program. The rows that follow
  DATALINES are read with list input and schtype is the only
  character variable;
DATA hsb10;
  INPUT id female race ses
        schtype $
        prog read write math science socst;
  DATALINES;
 147 1 1 3 pub 1 47  62  53  53  61
 108 0 1 2 pub 2 34  33  41  36  36
  18 0 3 2 pub 3 50  33  49  44  36
 153 0 1 2 pub 3 39  31  40  39  51
  50 0 2 2 pub 2 50  59  42  53  61
  51 1 2 1 pub 2 42  36  42  31  39
 102 0 1 1 pub 1 52  41  51  53  56
  57 1 1 2 pub 1 71  65  72  66  56
 160 1 1 2 pub 1 55  65  55  50  61
 136 0 1 2 pub 1 65  59  70  63  51
;
RUN;

* Print the ten observations just entered;
PROC PRINT DATA=hsb10;
RUN;

* Write the temporary dataset temp to a permanent SAS file that is
  named by its quoted physical path;
DATA 'c:\SAS_data\hs0';
  SET temp;
RUN;

* Print the first 10 observations straight from the permanent file,
  again referring to it by path and file name;
PROC PRINT DATA='c:\SAS_data\hs0'(OBS=10);
RUN;

*********************************************************
*                   Exploring Data                      *
*********************************************************; 

* Left-justify procedure output instead of centering it;
OPTIONS NOCENTER;

* Describe the permanent file hs0. POSITION additionally lists the
  variables by their position in the dataset;
PROC CONTENTS DATA='c:\SAS_data\hs0' POSITION;
RUN;

* Print the first 20 observations;
PROC PRINT DATA='c:\SAS_data\hs0'(OBS=20);
RUN;

* Print all observations, restricted to the variables on VAR;
PROC PRINT DATA='c:\SAS_data\hs0';
  VAR gender id race ses schtyp prgtype read;
RUN;

* Copy the permanent file hs0 into a temporary dataset of the same name;
DATA hs0;
  SET 'c:\SAS_data\hs0';
RUN;


* Default descriptive statistics for the variables in hs0;
PROC MEANS DATA=hs0;
RUN;

* Detailed univariate output for read and write;
PROC UNIVARIATE DATA=hs0;
  VAR read write;
RUN;

* Selected statistics (n, mean, median, std, variance) for the
  variables named on VAR;
PROC MEANS DATA=hs0 N MEAN MEDIAN STD VAR;
  VAR read math science write;
RUN;

* Same statistics, limited by WHERE to cases with read of 60 or more;
PROC MEANS DATA=hs0 N MEAN MEDIAN STD VAR;
  VAR read math science write;
  WHERE read >= 60;
RUN;

* Same statistics, reported separately for each level of prgtype;
PROC MEANS DATA=hs0 N MEAN MEDIAN STD VAR;
  VAR read math science write;
  CLASS prgtype;
RUN;

* Histogram of write with a fitted normal curve drawn over it;
PROC UNIVARIATE DATA=hs0;
  VAR write;
  HISTOGRAM / NORMAL;
RUN;

* Boxplot of write by prgtype.
  The permanent file is first sorted by the grouping variable. There is
  no OUT= option, so the sorted data replaces the permanent file hs0;
PROC SORT DATA='c:\SAS_data\hs0';
  BY prgtype;
RUN;

PROC BOXPLOT DATA='c:\SAS_data\hs0';
  PLOT write*prgtype / BOXWIDTH=10 BOXSTYLE=SCHEMATIC;
RUN;

* One-way frequency table for ses;
PROC FREQ DATA=hs0;
  TABLES ses;
RUN;

* Frequency table for ses together with a frequency plot
  (requested with PLOTS=FREQPLOT).
  Not available in SAS 9.1.3 and earlier;
ODS GRAPHICS ON;
PROC FREQ DATA=hs0;
  TABLES ses / PLOTS=FREQPLOT;
RUN;
ODS GRAPHICS OFF;

* Frequency table for write, which is a continuous variable;
PROC FREQ DATA=hs0;
  TABLES write;
RUN;

* Several one-way tables requested in a single step;
PROC FREQ DATA=hs0;
  TABLES gender schtyp prgtype;
RUN;

* Two-way crosstab of prgtype by ses;
PROC FREQ DATA=hs0;
  TABLES prgtype*ses;
RUN;

* Correlations among write, read and science. By default PROC CORR
  uses pairwise deletion of missing observations;
PROC CORR DATA=hs0;
  VAR write read science;
RUN;

* The same correlations with listwise deletion of missing
  observations, requested by the NOMISS option;
PROC CORR DATA=hs0 NOMISS;
  VAR write read science;
RUN;

* Scatter plot with write on the vertical axis and read on the
  horizontal axis;
PROC GPLOT DATA=hs0;
  PLOT write*read;
RUN;
QUIT;

*********************************************************
*                   Modifying Data                      *
*********************************************************; 

* Look at the structure of the permanent file hs0;
PROC CONTENTS DATA="c:\SAS_data\hs0";
RUN;

* Define the format scl, which supplies value labels for schtyp;
PROC FORMAT;
  VALUE scl
    1 = "public"
    2 = "private";
RUN;

* Frequency table of schtyp shown with the scl labels. The FORMAT
  statement here applies only to this step;
PROC FREQ DATA="c:\SAS_data\hs0";
  FORMAT schtyp scl.;
  TABLES schtyp;
RUN;

* Store the format with the variable by assigning it in a data step
  that creates the temporary dataset hs0b;
DATA hs0b;
  SET "c:\SAS_data\hs0";
  FORMAT schtyp scl.;
RUN;


* In a single data step give hs0b a dataset label and give the
  variable schtyp a variable label;
DATA hs0b(LABEL="High School and Beyond, 200 cases");
  SET hs0b;
  LABEL schtyp = "type of school";
RUN;

PROC CONTENTS DATA=hs0b;
RUN;

* Rename schtyp to public and gender to female in the temporary
  dataset hs0b;
DATA hs0b;
  SET hs0b(RENAME=(schtyp=public
                   gender=female));
RUN;

PROC CONTENTS DATA=hs0b;
RUN;

* Define the formats used by the data step that builds hs1;
PROC FORMAT;
  * value labels for schtyp;
  VALUE scl
    1 = "public"
    2 = "private";

  * value labels for grade;
  VALUE abcdf
    0 = "F"
    1 = "D"
    2 = "C"
    3 = "B"
    4 = "A";

  * value labels for female;
  VALUE fm
    1 = "female"
    0 = "male";
RUN;

* Create data file hs1 from the permanent file hs0, label the dataset,
  and rename the variable gender to female.
  The rename is done on the SET statement so that the name female is
  the one known inside the step. With RENAME= on the output dataset
  the old name must be used in program statements, so the LABEL and
  FORMAT statements for female would not refer to the renamed column ;
data hs1(label="High School and Beyond") ;

  * read in the sas file c:\SAS_data\hs0, renaming gender on input ;
  set "c:\SAS_data\hs0" (rename=(gender=female));

  * the label statement labels the variable schtyp ;
  label schtyp = "type of school";

  * apply value labels to schtyp ;
  format schtyp scl.;

  * the if-then-else chain creates a new numeric variable, called prog.
    prog stays missing when prgtype matches none of the three values ;
  if prgtype = "academic" then prog = 1;
  else if prgtype = "general" then prog = 2;
  else if prgtype = "vocati" then prog = 3;

  * the label statement labels the variable prog ;
  label prog = "type of program";

  * the label statement labels the variable female ;
  label female = "student's gender";

  * apply value labels to female ;
  format female fm.;

  * recode values of 5 in variable race to be missing (.) ;
  if race = 5 then race = .;

  * create a variable total that is the sum of read, write, math, and
    science. total is missing when any of the four is missing ;
  total = read + write + math + science;

  * recode total into grade. The missing check comes first because a
    missing value compares lower than any number and would otherwise
    fall into the lowest band ;
  if total = . then grade = .;
  else if total < 80 then grade = 0;
  else if total < 110 then grade = 1;
  else if total < 140 then grade = 2;
  else if total < 170 then grade = 3;
  else grade = 4;

  * label the variable grade ;
  label grade = "combined grades of read, write, math, and science";

  * apply value labels to variable grade ;
  format grade abcdf.;

run;

* Inspect hs1 to check its structure, first rows and labelled values;
PROC CONTENTS DATA=hs1;
RUN;

PROC PRINT DATA=hs1(OBS=20);
RUN;

PROC FREQ DATA=hs1;
  TABLES schtyp female;
RUN;

* Write the temporary dataset hs1 to a permanent file named by path;
DATA 'c:\SAS_data\hs1';
  SET hs1;
RUN;

* Standardize read and write to mean 0 and standard deviation 1.
  The result goes to hs1b and hs1 itself is left unchanged;
PROC STANDARD DATA=hs1 OUT=hs1b MEAN=0 STD=1;
  VAR read write;
RUN;

PROC PRINT DATA=hs1b(OBS=10);
RUN;

* Rebuild hs1b from hs1 (replacing the standardized copy) and add
  total2, computed with the sum() function;
DATA hs1b;
  SET hs1;
  total2 = SUM(OF read write math science);
RUN;

* Show total and total2 side by side for the first 20 observations;
PROC PRINT DATA=hs1b(OBS=20);
  VAR read write math science socst total total2;
RUN;

* Add to each observation the mean of read within its prog group.
  Step 1 sorts hs1 by prog.
  Step 2 writes the group means to the dataset readmean as variable m.
  Step 3 merges readmean back onto hs1 by prog;
PROC SORT DATA=hs1;
  BY prog;
RUN;

PROC MEANS DATA=hs1 MEAN;
  BY prog;
  VAR read;
  OUTPUT OUT=readmean MEAN=m;
RUN;

* Inspect the dataset of group means;
PROC PRINT DATA=readmean;
RUN;

* Sort hs1 by prog again ahead of the merge;
PROC SORT DATA=hs1;
  BY prog;
RUN;

* Match-merge on prog and drop the bookkeeping variables
  _TYPE_ and _FREQ_ that come from readmean;
DATA merged;
  MERGE hs1 readmean;
  BY prog;
  DROP _TYPE_ _FREQ_;
RUN;

* Check the merged data;
PROC PRINT DATA=merged(OBS=20);
  VAR prog m read;
RUN;


*********************************************************
*                   Managing Data                       *
*********************************************************;
 
* 2.1 Create a library. The libref mylib points at the folder
  c:\SAS_data\ so files in it can be named as mylib.name;
LIBNAME mylib "c:\SAS_data\";

* Print from mylib.hs1, which is the hs1 file in that folder
  reached through the libref instead of the quoted path;
PROC PRINT DATA=mylib.hs1(OBS=10);
  VAR write read science;
RUN;

* 2.2 Selecting cases using where.
  Files are now referenced through the libref rather than by path.
  goodread keeps only the cases with read of 60 or more;
DATA mylib.goodread;
  SET mylib.hs1;
  WHERE read >= 60;
RUN;

PROC MEANS DATA=mylib.goodread;
  VAR read;
RUN;

* 2.3 Keeping a subset of variables. Only the listed variables
  are written to hskept;
DATA mylib.hskept;
  SET mylib.goodread;
  KEEP id female read write;
RUN;

PROC CONTENTS DATA=mylib.hskept;
RUN;

* 2.4 Dropping a subset of variables. Everything except ses and
  prog is written to hsdropped;
DATA mylib.hsdropped;
  SET mylib.goodread;
  DROP ses prog;
RUN;

PROC CONTENTS DATA=mylib.hsdropped;
RUN;

* 2.5 Appending datasets.
  First look at the frequency of the variable female in each file;
PROC FREQ DATA=mylib.hsmale;
  TABLES female;
RUN;

PROC FREQ DATA=mylib.hsfemale;
  TABLES female;
RUN;

* Stack the two files with one SET statement and save the result
  as mylib.hsmaster;
DATA mylib.hsmaster;
  SET mylib.hsmale
      mylib.hsfemale;
RUN;

* The combined file should now contain both males and females;
PROC FREQ DATA=mylib.hsmaster;
  TABLES female;
RUN;

* 2.6 Merging datasets.
  Examine the first rows of the two files to be merged;
PROC PRINT DATA=mylib.hsdem(OBS=10);
RUN;

PROC PRINT DATA=mylib.hstest(OBS=10);
RUN;

* Sort each file by id, the variable that identifies the cases.
  OUT= sends the sorted copies to the temporary datasets dem and test;
PROC SORT DATA=mylib.hsdem OUT=dem;
  BY id;
RUN;

PROC SORT DATA=mylib.hstest OUT=test;
  BY id;
RUN;

* Match-merge the sorted copies on id into mylib.hsall;
DATA mylib.hsall;
  MERGE dem test;
  BY id;
RUN;

PROC CONTENTS DATA=mylib.hsall;
RUN;


*********************************************************
*                   Analyzing Data                      *
*********************************************************; 

* 2.1 Chi-squared test of prgtype by ses, with expected cell
  frequencies added to the table;
PROC FREQ DATA='c:\SAS_data\hs0';
  TABLES prgtype*ses / EXPECTED CHISQ;
RUN;

* 2.2 t-tests;
* One sample t-test of write against the value given by H0=50;
PROC TTEST DATA='c:\SAS_data\hs1' H0=50;
  VAR write;
RUN;

* Paired t-test of write and read;
PROC TTEST DATA='c:\SAS_data\hs1';
  PAIRED write*read;
RUN;

* Two sample independent t-test of write between the groups of female;
PROC TTEST DATA='c:\SAS_data\hs1';
  VAR write;
  CLASS female;
RUN;


* 2.3 ANOVA;
* One-way ANOVA of write on prog. SS3 limits the output to
  type III sums of squares;
PROC GLM DATA='c:\SAS_data\hs1';
  CLASS prog;
  MODEL write = prog / SS3;
RUN;
QUIT;

* ANCOVA, which adds read to the model as a continuous covariate;
PROC GLM DATA='c:\SAS_data\hs1';
  CLASS prog;
  MODEL write = read prog / SS3;
RUN;
QUIT;

* 2.4 Regression;
* OLS regression of write on female and read;
PROC REG DATA='c:\SAS_data\hs1';
  MODEL write = female read;
RUN;
QUIT;

* OLS regression of math on write and socst with a diagnostic plot
  of residuals against predicted values. The OUTPUT statement also
  writes a temporary dataset (temp) that holds the predicted values
  of math (predict) and the residuals (resid);
PROC REG DATA='c:\SAS_data\hs1';
  MODEL math = write socst;
  PLOT r.*p.;
  OUTPUT OUT=temp P=predict R=resid;
RUN;
QUIT;

* Look at the temporary dataset (temp);
PROC PRINT DATA=temp(OBS=20);
  VAR math predict resid;
RUN;


* 2.5 Logistic regression ;
* create a dichotomous variable honcomp that is 1 when write is 60 or
  more and 0 otherwise.
  The comparison is only evaluated when write is present. A missing
  value compares lower than any number, so an unguarded comparison
  would code a missing write as 0 instead of leaving honcomp missing ;
data hs2;
  set 'c:\SAS_data\hs1';
  if not missing(write) then honcomp = (write >= 60);
run;

* Logistic regression with descending option (so model predicts 1s rather than 0s);
proc logistic data=hs2 descending;
  model honcomp = female read;
run;

* 2.6 Nonparametric tests;
* Sign test (nonparametric analog of the single sample t-test).
  MU0=50 sets the location value that write is tested against;
PROC UNIVARIATE DATA='c:\SAS_data\hs1' MU0=50;
  VAR write;
RUN;

* Signed rank test (nonparametric analog of the paired t-test).
  First create the difference variable diff as read minus write;
DATA hs1c;
  SET 'c:\SAS_data\hs1';
  diff = read - write;
RUN;

* Then test that diff equals 0;
PROC UNIVARIATE DATA=hs1c;
  VAR diff;
RUN;

* Rank sum test (nonparametric analog of the independent
  two-sample t-test) of write by female;
PROC NPAR1WAY DATA='c:\SAS_data\hs1';
  VAR write;
  CLASS female;
RUN;

* Kruskal-Wallis test (nonparametric analog of the one-way ANOVA)
  of write by ses;
PROC NPAR1WAY DATA='c:\SAS_data\hs1';
  VAR write;
  CLASS ses;
RUN;
