options ls=78;
title "Cluster Analysis - Woodyard Hammock - K-Means";

 /* After reading in the data, an ident variable is created holding the
  * row number of each observation. This is needed to identify the
  * observations in the clustering procedure.
  * The drop statement removes several variables not used for this analysis.
  */

data wood;
  /* Read the comma-separated file, skipping its first line (firstobs=2;
   * presumably a header row -- confirm against the file).
   *   dsd       - two consecutive commas are read as an empty (missing)
   *               field instead of being collapsed into one delimiter,
   *               which would shift every later value into the wrong
   *               variable; quoted values are also handled.
   *   truncover - a record with fewer fields than the input list leaves
   *               the remaining variables missing instead of silently
   *               continuing onto the next line of the file.
   */
  infile 'D:\Statistics\STAT 505\data\wood.csv'
         firstobs=2 dsd delimiter=',' truncover;

  /* One numeric value per variable, in file column order. */
  input x y acerub carcar carcor cargla cercan corflo faggra frapen
        ileopa liqsty lirtul maggra magvir morrub nyssyl osmame ostvir
        oxyarb pingla pintae pruser quealb quehem quenig quemic queshu quevir
        symtin ulmala araspi cyrrac;

  /* _n_ counts data step iterations; one record is read per iteration,
   * so ident is the row number of the observation.
   */
  ident=_n_;

  /* These variables are read (to keep the columns aligned) but are not
   * written to the output data set.
   */
  drop acerub carcor cargla cercan frapen lirtul magvir morrub osmame pintae
       pruser quealb quehem queshu quevir ulmala araspi cyrrac;
  run;

 /* The observations are sorted by their ident value.
  */

proc sort data=wood;
  /* No out= is given, so wood is replaced by its sorted version;
   * rows are put in ascending order of ident.
   */
  by ident;
run;

 /* The fastclus procedure performs non-hierarchical (k-means type)
  * clustering.
  * The maxclusters option is the maximum number of clusters it works
  * with throughout the algorithm. The radius option specifies
  * the minimum distance between new seeds.
  * The maxiter option specifies the maximum number of iterations.
  */

proc fastclus
    data=wood          /* input observations                        */
    out=clust          /* output data set produced by the procedure */
    maxclusters=4      /* at most four clusters                     */
    maxiter=100        /* at most 100 iterations                    */
    radius=20;         /* minimum distance between new seeds        */
  /* ident labels each observation in the procedure's output. */
  id ident;
  /* Variables on which the clustering is based. */
  var carcar corflo faggra ileopa liqsty maggra nyssyl
      ostvir oxyarb pingla quenig quemic symtin;
run;