/* You are on page 1 of 4 */

/* HIERARCHICAL CLUSTER ANALYSIS */

/* EUCLIDEAN DISTANCES are used by default in proc cluster */

/* Hierarchical clustering of the chest data using the centroid method on
   the variables chest, waist and hip. NONORM leaves the distances
   un-normalized; CCC, PSEUDO, RMSSTD and RSQUARE request the corresponding
   statistics in the cluster history. The tree is written to the data set
   TREE using the documented OUTTREE= option (the same option used later in
   this file), and the dendrogram uses R-square as its height axis. */
proc cluster data=chest method=centroid nonorm ccc pseudo rmsstd rsquare
             outtree=tree
             plots=dendrogram(height=rsq);
   id individual;        /* label observations by the individual variable */
   var chest waist hip;  /* variables used to compute the distances */
run;
/* Some choices for the method option:
centroid, single, complete, average, ward*/

/* The following warning is very common:

WARNING: The MAXPOINTS option value 100 is less than the number of clusters (
496). This may result in a dendrogram that is difficult to read. The
dendrogram will not be displayed. You can use the PLOTS(MAXPOINTS=)
option in the PROC CLUSTER statement to change this maximum.

To solve this, add the PLOTS(MAXPOINTS=) option as shown below. Choose a
value that is at least as large as your sample size. */

/* Same centroid analysis as before, with the dendrogram point limit raised
   to 500 so the plot is not suppressed. The global plot option and the plot
   request are combined into a single PLOTS(MAXPOINTS=500)=DENDROGRAM(...)
   specification, rather than two separate PLOTS options, so that both the
   point limit and the R-square height axis are guaranteed to apply. The
   tree is written to TREE with the documented OUTTREE= option. */
proc cluster data=chest method=centroid nonorm ccc pseudo rmsstd rsquare
             outtree=tree
             plots(maxpoints=500)=dendrogram(height=rsq);
   id individual;        /* label observations by the individual variable */
   var chest waist hip;  /* variables used to compute the distances */
run;

/* Customized dendrogram */

/* Read the tree data set TREE, cut it at the 2-cluster level and write the
   resulting cluster assignments to PovCL. */
proc tree data=tree
          nclusters=2
          out=PovCL;
run;

/* Nonhierarchical clustering analysis */

/* Nonhierarchical clustering of the chest data into at most 3 clusters,
   with at most 20 iterations. The cluster assignments are written to the
   data set NONHIERARCH. RADIUS=0 and REPLACE=FULL control how the initial
   seeds are selected and replaced. */
proc fastclus data=chest
              out=nonhierarch
              maxclusters=3
              maxiter=20
              radius=0
              replace=full;
   id individual;
   var chest waist hip;
run;
/* OTHER CHOICES OF DISTANCE MEASURES */

/* EUCLIDEAN distances */
/* Use proc DISTANCE to calculate distances of our choice */

/* Euclidean distances between the brands in BEER, written to DistE.
   price is declared as an interval variable and quality as ordinal. */
proc distance data=beer
              method=Euclid
              out=DistE;
   var interval(price)
       ordinal(quality);
   id brand;   /* rows/columns of the distance matrix are labelled by brand */
run;
/* Standardized values calculated by default. If standardization is not
wanted, add the option 'nostd' on the first line. */

/* When a variable is defined as ordinal, the Euclidean distances are


automatically based on ranks */

/* NOTE: A character variable has to be used for the “id” statement. A


separate document describes how to convert character and numeric
variables. */

/* SQUARED EUCLIDEAN distances */


/* Squared Euclidean distances between the brands in BEER, written to
   DistSqE. Same variable roles as in the Euclidean step: price is interval,
   quality is ordinal. */
proc distance data=beer
              method=sqEuclid
              out=DistSqE;
   var interval(price)
       ordinal(quality);
   id brand;
run;

/* MAHALANOBIS DISTANCES – requires numerical variables */


/* First run proc princomp with the STD option to produce principal
component scores in the out= data set having an identity covariance
matrix. */

/* Principal components of price and quality. STD standardizes the
   component scores; the scores are written to STD_SCORES and the statistics
   to OUTSTAT. NOPRINT suppresses the printed output. */
proc princomp data=beer
              std
              noprint
              out=std_scores
              outstat=outstat;
   var price quality;
run;
/* ‘std’ standardizes the scores, which together now represent the
standardized multivariate summary of the variables (bivariate in this
example). The Mahalanobis distance and Euclidean distances are
equivalent for these scores */

/* Then use proc distance to get Mahalanobis distances (=Euclidean


distances, provided by default) between all possible pairs of points.*/
/* Pairwise distances between brands computed on the standardized principal
   component scores (all variables whose names start with "prin"). No
   METHOD= is given, so the procedure default is used. Output goes to
   DistMah. */
proc distance data=std_scores
              out=DistMah;
   var interval(prin:);
   id brand;
run;
/* GOWER SIMILARITY COEFFICIENT – can be used for a mix of variable
types */

/* Gower dissimilarities between the brands in BEER2, written to DistGow.
   The variables mix three measurement levels: price (interval),
   quality (ordinal) and group (nominal). */
proc distance data=beer2
              method=dgower
              out=DistGow;
   var interval(price)
       ordinal(quality)
       nominal(group);
   id brand;
run;

/* Method “gower” produces similarity coefficients, while “dgower” (as


above) produces distance/dissimilarity coefficients - which is what
must be used in proc cluster. */

/* NOTE: A character variable has to be used for the “id” statement. A


separate document describes how to convert character and numeric
variables. */

/* Then use proc CLUSTER based on the distance data. The single,
complete, or average methods are recommended with Gower’s measure. */

/* Average-linkage clustering on the Gower dissimilarities. The input is
   DistGow, the data set written by the PROC DISTANCE step with
   method=dgower; the previous name "dist" is not created anywhere in this
   program. NONORM leaves the distances un-normalized; RMSSTD and RSQUARE
   add those statistics to the cluster history. */
proc cluster data=DistGow method=average nonorm rmsstd rsquare;
   id brand;
run;

/* proc CLUSTER squares the distances by default */


/* To use unsquared distances, e.g. when using association
coefficients, use the ‘nosquare’ option */

/* Average-linkage clustering on the Gower dissimilarities, with NOSQUARE so
   the distances are used as they are instead of being squared. The input is
   DistGow, the data set written by the PROC DISTANCE step with
   method=dgower; the previous name "dist" is not created anywhere in this
   program. */
proc cluster data=DistGow method=average nonorm rmsstd rsquare nosquare;
   id brand;
run;

/* FINDING WHICH INDIVIDUALS BELONG TO WHICH CLUSTER */


/* Based on the centroid method, choice of 3 clusters: */

/* Save the cluster tree in a data set named cluster_analysis */


/* Centroid clustering on a precomputed distance data set; the tree is
   stored in CLUSTER_ANALYSIS and printed output is suppressed.
   NOTE(review): distance_matrix is not created in the visible part of this
   program - presumably it stands for one of the PROC DISTANCE outputs;
   confirm which one is intended. */
proc cluster data=distance_matrix
             method=centroid
             nonorm
             noprint
             outtree=cluster_analysis;
   id brand;
run;

/* Cut the tree stored in CLUSTER_ANALYSIS at the 3-cluster level and write
   one row per object, with its cluster number, to CLUSTERS. */
proc tree data=cluster_analysis
          out=clusters
          ncl=3
          noprint;
   copy brand;   /* carry the brand variable over into the output data set */
run;

/* Frequency table of the cluster variable: how many objects fall in each
   cluster. */
proc freq data=clusters;
   tables cluster;
run;

/* Merge this information into the original data set */


/* Sort both data sets by brand, in place, so they can be merged on it. */
proc sort data=beer;
   by brand;
run;

proc sort data=clusters;
   by brand;
run;

/* Attach the cluster number to each row of BEER by match-merging on brand.
   Only brand and cluster are read from CLUSTERS. */
data beer_clusters;
   merge beer
         clusters(keep=brand cluster);
   by brand;
run;

/* Scatter plot of quality against price, one marker group per cluster,
   with each point labelled by its brand. */
proc sgscatter data=beer_clusters;
   plot quality*price
        / group=cluster
          datalabel=brand;
run;

/* Find some DESCRIPTIVE STATISTICS FOR THE CLUSTERS, based on the
variables we have information about (these can include variables other
than the ones used to form the clusters) */

/* Summary statistics of price and quality, computed separately for each
   cluster. */
proc means data=beer_clusters;
   class cluster;
   var price quality;
run;

/* Box plots of price for each cluster.
   PROC BOXPLOT expects its input to be ordered by the group variable: each
   run of consecutive observations with the same cluster value is drawn as
   one box. BEER_CLUSTERS is ordered by brand, so it is first sorted by
   cluster into a separate data set, leaving BEER_CLUSTERS itself
   untouched. */
proc sort data=beer_clusters out=beer_clusters_by_cluster;
   by cluster;
run;

proc boxplot data=beer_clusters_by_cluster;
   plot price*cluster;
run;

/* You might also like */