* Session setup: left-justified output with no date and no page number;
OPTIONS NOCENTER NODATE NONUMBER;

* Read the stored hsb2 data set into the data set hsb2 used by the rest of the script;
* Change the path below to the location where you saved the hsb2 data set;
DATA hsb2;
  SET 'd:\hsb2';
RUN;
* Basic scatterplot of write against math, drawn with purple dots;
GOPTIONS RESET=ALL;
SYMBOL VALUE=dot COLOR=purple;
PROC GPLOT DATA=hsb2;
  PLOT write * math;
RUN;
QUIT;
*creating an outlier;
/* Builds data set outlier: one artificial observation (write=50, math=99,
   id=201) followed by the observations read from hsb2. */
data outlier;
  /* On the first iteration only, fill in the values of the artificial point. */
  if _n_ = 1 then do;
  write = 50;
  math = 99;
  id = 201;
  end;
  /* OUTPUT is placed before SET on purpose: each iteration writes whatever is
     currently in the program data vector and only then reads the next hsb2
     observation. Iteration 1 therefore writes the artificial point, and each
     later iteration writes the hsb2 observation that the previous iteration
     read (variables read by SET keep their values across iterations). */
  output;  
  set hsb2;
run;
* Redraw the scatterplot with the value of id printed at each point and no plot symbol;
* so that the artificial outlier can be picked out by its id;
GOPTIONS RESET=ALL;
SYMBOL1 POINTLABEL=("#id") FONT=simplex VALUE=none;
PROC GPLOT DATA=outlier;
  PLOT write * math = 1;
RUN;
QUIT;
* Scatterplot matrix: every one of write, math, socst and female against every other;
PROC INSIGHT DATA=hsb2;
  SCATTER write math socst female
        * write math socst female;
RUN;
* Single scatterplot again, this time with one symbol definition per value of female;
GOPTIONS RESET=ALL;
SYMBOL1 VALUE=circle COLOR=blue;
SYMBOL2 VALUE=dot    COLOR=red;
* Vertical axis label rotated by 90 degrees;
AXIS LABEL=(ANGLE=90 'Writing Scores');
PROC GPLOT DATA=hsb2;
  PLOT write * math = female / VAXIS=axis;
RUN;
QUIT;
* Box plots of math and socst within each level of prog;
* The data are sorted by prog first so that each program forms one contiguous group;
PROC SORT DATA=hsb2 OUT=sort;
  BY prog;
RUN;

GOPTIONS RESET=ALL;
PROC BOXPLOT DATA=sort;
  PLOT (math socst) * prog / BOXWIDTH=10;
RUN;

* The same plots with the cosmetic options switched on;
* (schematic boxes with points identified by id, colours, no horizontal axis label);
PROC BOXPLOT DATA=sort;
  PLOT (math socst) * prog /
       BOXSTYLE=schematicid
       NOHLABEL
       CBOXES=blue
       CBOXFILL=yellow
       IDCOLOR=red
       BOXWIDTH=10;
  ID id;
RUN;
*other cool graphs through proc univariate;
*The red line is the kernel density based on our sample. ;
*The blue line is the normal curve.; 
/* Histogram of socst with yellow-filled bars, requesting both a fitted normal
   curve and a kernel density estimate. NOPRINT is specified on the PROC
   statement, so the step is used for the graph only.
   NOTE(review): color= is written after kernel without parentheses - confirm
   against the HISTOGRAM statement documentation that the line colours come
   out as the two comments above describe. */
goptions reset=all;
proc univariate data=hsb2 noprint;
 histogram socst/ cfill=yellow normal kernel color = red cbarline=blue;
run;
* REGRESSION diagnostics;
* Step 1: add math2, the square of math, so that residuals can be plotted against it;
DATA hsb;
  SET hsb2;
  math2 = math * math;
RUN;

* Step 2: regress write on female, math and socst and draw four sets of diagnostic plots;
GOPTIONS RESET=ALL;
PROC REG DATA=hsb;
  /* math2 is not in the model - it is listed on VAR so it can be used in PLOT */
  VAR math2;
  MODEL write = female math socst;

  /* Plot 1: residuals against predicted values and against math2.
     Systematic variation against math2 suggests adding it as a predictor. */
  PLOT r.*p. r.*math2 / CLINE=black;

  /* Plot 2: studentized deleted residuals against predicted values,
     with reference lines at -2.5 and 2.5. */
  PLOT rstudent.*p. / VREF=-2.5 2.5 CLINE=black;

  /* Plot 3: normal probability plot of the residuals
     (emphasizes deviations from normality in the middle). */
  PLOT npp.*r. / MODELLAB=' Quantile plot:';

  /* Plot 4: normal quantile plot of the residuals
     (emphasizes deviations from normality at the tails). */
  PLOT r.*nqq. / MODELLAB=' Quantile plot:' NOLINE;

  /* Small red circles for the points in all of the plots above */
  SYMBOL VALUE=circle COLOR=red HEIGHT=.8;
RUN;
QUIT;
* Predicted values for males and females;
* Refit the model on hsb2 and store the predicted values as predict in data set temp;
PROC REG DATA=hsb2;
  MODEL write = female math socst;
  OUTPUT OUT=temp PREDICTED=predict;
RUN;
QUIT;

* Plot the predictions against math and against socst, one symbol per value of female;
SYMBOL1 VALUE=dot    COLOR=red;
SYMBOL2 VALUE=circle COLOR=blue;
AXIS1 LABEL=(ANGLE=90 'Predicted values');
PROC GPLOT DATA=temp;
  PLOT predict * math  = female / VAXIS=axis1;
  PLOT predict * socst = female / VAXIS=axis1;
RUN;
QUIT;
*MODEL SELECTION;
/* Using the Mallows Cp statistic in model selection.
   This heading is a block comment and contains no apostrophe on purpose:
   a comment statement that starts with an asterisk must not contain an
   unmatched quotation mark, because the quote can be read as the start of
   a quoted string and swallow the statements that follow. */

/* Candidate predictors: the square of math plus five two-way products. */
data selection;
  set hsb2;
  math2 = math*math;
  mathf = math*female;
  mathsch = math*schtyp;
  mathsci = math*science;
  sciencef = science*female;
  progsch = prog*schtyp;
run;

/* Fit all subsets (selection=rsquare) and plot Cp against the number of
   parameters, with the Hocking (red) and Mallows (blue) reference lines. */
symbol v=circle c=black;
proc reg data=selection;
  model write = math socst female schtyp prog science math2 mathf mathsch mathsci 
                sciencef progsch/ selection=rsquare ;
  plot cp.*np. / chocking=red cmallows=blue vaxis=0 to 15 by 5;
run;
quit;
* R-squared model selection: subsets of 2 to 6 predictors, the 3 best of each size, with Cp printed;
PROC REG DATA=selection;
  MODEL write = math socst female schtyp prog science
                math2 mathf mathsch mathsci sciencef progsch
        / SELECTION=rsquare CP BEST=3 START=2 STOP=6;
RUN;
QUIT;
* SURVIVAL ANALYSIS;
* Read the stored uis data set into the data set uis;
* Change the path below to the location where you saved the uis data set;
DATA uis;
  SET 'd:\uis';
RUN;

* Kaplan-Meier survival curves, one per level of treat;
* censor(0) declares that censor = 0 marks a censored time;
GOPTIONS RESET=ALL;
PROC LIFETEST DATA=uis PLOTS=(s);
  TIME time * censor(0);
  STRATA treat;
  SYMBOL COLOR=red;
RUN;
* Survival functions from a Cox model for a person who is 30 years old, has 5 previous;
* drug treatments and was treated at site A, with one curve per treatment group;

* Covariate pattern for treat = 1;
DATA cov;
  age = 30;  ndrugtx = 5;  treat = 1;  site = 0;  agesite = 0;
RUN;

* Covariate pattern for treat = 0 - identical apart from treat;
DATA cov_short;
  age = 30;  ndrugtx = 5;  treat = 0;  site = 0;  agesite = 0;
RUN;

* The same model is fitted twice, each time with a different covariate pattern;
* agesite is the age by site interaction, computed inside PROC PHREG;
* The estimated survival function is written to variable surv of the OUT= data set;
PROC PHREG DATA=uis NOPRINT;
  MODEL time*censor(0) = age ndrugtx treat site agesite;
  agesite = age * site;
  BASELINE OUT=surv COVARIATES=cov SURVIVAL=surv / NOMEAN;
RUN;

PROC PHREG DATA=uis NOPRINT;
  MODEL time*censor(0) = age ndrugtx treat site agesite;
  agesite = age * site;
  BASELINE OUT=surv_short COVARIATES=cov_short SURVIVAL=surv / NOMEAN;
RUN;
/* Stack the two sets of survival estimates: rows from surv first, then
   the rows from surv_short. */
data combo;
  set surv surv_short;
run;

/* Prepend one extra point per treatment group at time 1172 so that both
   step curves are drawn out to the same right-hand end.

   Both extra rows are written during the first iteration, BEFORE anything
   is read from combo. Assigning the second extra row on a later iteration
   (after a SET has already executed) would overwrite time, surv and treat
   of the observation that SET had just read, and that observation would
   never reach the output data set. After the extra rows, every observation
   of combo is read and written unchanged. */
data combo;
  if _n_ = 1 then do;
    time = 1172;
    surv = 0.08429;
    treat = 0;
    output;
    time = 1172;
    surv = 0.15060;
    treat = 1;
    output;
  end;
  set combo;
  output;
run;

/* Order by time so that the step interpolation connects the points in
   time order. */
proc sort data=combo;
  by time;
run;
* Step-function survival curves, one per value of treat;
GOPTIONS RESET=ALL;
SYMBOL1 COLOR=red  VALUE=triangle HEIGHT=.6 INTERPOL=stepjll;
SYMBOL2 COLOR=blue VALUE=circle   HEIGHT=.6 INTERPOL=stepjll;
AXIS1 LABEL=(ANGLE=90 'Survivorship function');
PROC GPLOT DATA=combo;
  PLOT surv * time = treat / VAXIS=axis1;
RUN;
QUIT;
*Plotting 3D graphs;
*Creating the color variable colorval: red for prog 1, blue for prog 2, green otherwise;
data color;
  set hsb2;
  /* Declare the length explicitly. Without this statement the length of
     colorval is fixed at 3 by the first assignment ("red"), and the other
     two values are truncated to "blu" and "gre". The statement is placed
     after SET so that colorval stays the last variable in the data set. */
  length colorval $ 8;
  if prog=1 then colorval="red";
  else if prog=2 then colorval="blue";
  else colorval="green";
run;

/* Two 3D scatter plots of write over the socst by math plane, coloured by
   colorval: first with pillar shapes, then with the default shape and the
   needles suppressed. */
proc g3d data=color;
   scatter socst*math=write / shape='pillar' color=colorval caxis=blue;
   scatter socst*math=write / color=colorval caxis=blue noneedle;
run;
quit;
* Add the squared terms math2 and soc2 and the math by socst product mathsc;
DATA interaction;
  SET hsb2;
  mathsc = math * socst;
  math2  = math ** 2;
  soc2   = socst ** 2;
RUN;

* Fit the regression to obtain the parameter estimates;
PROC REG DATA=interaction;
  MODEL write = math socst mathsc female math2 soc2;
RUN;
QUIT;

* Maximum and minimum of the two predictors, used to choose the range of the grid;
PROC MEANS DATA=hsb2 MAX MIN;
  VAR math socst;
RUN;
*Filling in values to make a nice graph.  If we only use the values already in the 
data set, the graph will not be a surface but rather a bunch of scatter lines.  To fix this
we create a data set with values for math and socst at regular small intervals across the
range of each variable.  Then we create the predicted values for each gender group using 
the parameter estimates obtained from the proc reg.  This will result in a graph with a nice
surface plot.;
* Regular grid over math (30 to 75) and socst (25 to 75) in steps of 2.5;
* yhatf and yhatm are evaluated at every grid point from the same equation;
* and differ only in the intercept (6.77 for yhatf and 2 for yhatm);
DATA graph;
  DO math = 30 TO 75 BY 2.5;
    DO socst = 25 TO 75 BY 2.5;
      yhatf = 6.77 + .74*math +.34*socst - .009*math*socst + .002*math**2 + .004*socst**2;
      yhatm = 2 + .74*math +.34*socst - .009*math*socst + .002*math**2 + .004*socst**2;
      OUTPUT;
    END;
  END;
RUN;

* Quick look at the first 10 grid points;
PROC PRINT DATA=graph (OBS=10);
RUN;
* Surface plot of yhatf over the grid, redrawn at rotations from 0 to 360 degrees in steps of 15;
PROC G3D DATA=graph;
  PLOT math * socst = yhatf /
       ROTATE=0 TO 360 BY 15
       CBOTTOM=black
       CTOP=red
       CAXIS=blue
       XTICKNUM=5
       YTICKNUM=5
       GRID;
RUN;
QUIT;

*The great website of various graphs examples from SAS: ;
* http://www.sas.com/service/techsup/sample/sample_graph.html ;
