Data Method Nonorm CCC Pseudo RMSSTD Rsquare RSQ Id Var: Proc Cluster

/* HIERARCHICAL CLUSTER ANALYSIS */
/* EUCLIDEAN DISTANCES are used by default in proc cluster */
/* Hierarchical clustering of the CHEST data with the centroid method.
   outtree= stores the cluster history in data set TREE so that proc tree
   can read it afterwards. proc cluster names this option outtree= (the
   later cluster_analysis step in this file uses the same spelling); out=
   is not the option that saves the tree.
   ccc, pseudo, rmsstd and rsquare request additional cluster statistics;
   the dendrogram is drawn with R-square on the height axis. */
proc cluster data=chest method=centroid nonorm ccc pseudo
             rmsstd rsquare outtree=tree
             plots=dendrogram(height=rsq);
   id individual;
   var chest waist hip;
run;
/* Some choices for the method option:
centroid, single, complete, average, ward*/
/* The following warning is very common:

WARNING: The MAXPOINTS option value 100 is less than the number of clusters (
496). This may result in a dendrogram that is difficult to read. The
dendrogram will not be displayed. You can use the PLOTS(MAXPOINTS=)
option in the PROC CLUSTER statement to change this maximum.
To solve this, add the plots(maxpoints) option below. Choose a value
that corresponds to your sample size. */
/* Same centroid clustering of CHEST, with the dendrogram point limit
   raised so the plot is not suppressed for larger samples.
   - maxpoints= is a global plot option, so it is attached to the single
     plots request: plots(maxpoints=500)=dendrogram(...), instead of
     giving two separate plots specifications.
   - outtree= (not out=) saves the tree in data set TREE for proc tree.
   - The var statement is repeated so this run clusters on the same three
     variables as the first run instead of on every numeric variable. */
proc cluster data=chest method=centroid nonorm ccc pseudo
             rmsstd rsquare outtree=tree
             plots(maxpoints=500)=dendrogram(height=rsq);
   id individual;
   var chest waist hip;
run;
/* Customized dendrogram */
/* Read the tree stored in TREE, cut it at two clusters, and write the
   resulting cluster assignment to data set PovCL. */
proc tree
   data=tree
   nclusters=2
   out=PovCL;
run;
/* Nonhierarchical clustering analysis */
/* Nonhierarchical clustering of CHEST with proc fastclus.
   At most three clusters and at most 20 iterations are requested;
   the cluster assignments are written to data set NONHIERARCH.
   No var statement is given, so the procedure chooses the analysis
   variables itself. */
proc fastclus
   data=chest
   out=nonhierarch
   maxclusters=3
   maxiter=20
   radius=0
   replace=full;
   id individual;
run;
/* OTHER CHOICES OF DISTANCE MEASURES */
/* EUCLIDEAN distances */
/* Use proc DISTANCE to calculate distances of our choice */
/* Euclidean distances between the observations of BEER, written to DistE.
   price is declared as an interval variable and quality as ordinal;
   brand labels the rows and columns of the distance matrix. */
proc distance
   data=beer
   method=Euclid
   out=DistE;
   id brand;
   var interval(price) ordinal(quality);
run;
/* Standardized values calculated by default. If standardization is not
wanted, add the option 'nostd' on the first line. */
/* When a variable is defined as ordinal, the Euclidean distances are

automatically based on ranks */
/* NOTE: A character variable has to be used for the “id” statement. A

separate document describes how to convert character and numeric
variables. */
/* SQUARED EUCLIDEAN distances */

/* Squared Euclidean distances for BEER, written to DistSqE.
   Same variable declarations as the plain Euclidean step. */
proc distance
   data=beer
   method=sqEuclid
   out=DistSqE;
   id brand;
   var interval(price) ordinal(quality);
run;
/* MAHALANOBIS DISTANCES – requires numerical variables */

/* First run proc princomp with the STD option to produce principal
component scores in the out= data set having an identity covariance
matrix. */
/* Principal component scores of price and quality.
   std requests standardized scores; they are saved in STD_SCORES and
   the accompanying statistics in OUTSTAT. noprint suppresses the
   printed output, since only the data sets are needed. */
proc princomp
   data=beer
   std
   noprint
   out=std_scores
   outstat=outstat;
   var price quality;
run;
/* ‘std’ standardizes the scores, which together now represent the
standardized multivariate summary of the variables (bivariate in this
example). The Mahalanobis distance and Euclidean distances are
equivalent for these scores */
/* Then use proc distance to get Mahalanobis distances (=Euclidean

distances, provided by default) between all possible pairs of points.*/
/* Distances between the standardized principal component scores,
   written to DistMah. No method= is given, so the procedure's default
   distance is used. prin: selects every variable whose name starts
   with "prin". */
proc distance
   data=std_scores
   out=DistMah;
   id brand;
   var interval(prin:);
run;
/* GOWER SIMILARITY COEFFICIENT – can be used for a mix of variable
types */
/* Gower dissimilarities for BEER2, written to DistGow.
   The variable list mixes three measurement levels: interval (price),
   ordinal (quality) and nominal (group). */
proc distance
   data=beer2
   method=dgower
   out=DistGow;
   id brand;
   var interval(price) ordinal(quality) nominal(group);
run;
/* Method “gower” produces similarity coefficients, while “dgower” (as

above) produces distance/dissimilarity coefficients - which is what
must be used in proc cluster. */
/* NOTE: A character variable has to be used for the “id” statement. A

separate document describes how to convert character and numeric
variables. */
/* Then use proc CLUSTER based on the distance data. The single,
complete, or average methods are recommended with Gower’s measure. */
/* Average-linkage clustering on the Gower dissimilarities.
   The input is DistGow, the data set written by the proc distance step
   with method=dgower. The original read data=dist, but no step in this
   file creates a data set with that name. To cluster on another
   distance measure, substitute the matching proc distance output
   (DistE, DistSqE or DistMah). */
proc cluster data=DistGow method=average nonorm rmsstd rsquare;
   id brand;
run;
/* proc CLUSTER squares the distances by default */

/* To use unsquared distances, e.g. when using association
coefficients, use the ‘nosquare’ option */
/* Average-linkage clustering on the Gower dissimilarities, with
   nosquare so that the distances are used as given instead of squared.
   The input is DistGow, the data set written by the proc distance step
   with method=dgower. The original read data=dist, but no step in this
   file creates a data set with that name. */
proc cluster data=DistGow method=average nonorm rmsstd rsquare nosquare;
   id brand;
run;
/* FINDING WHICH INDIVIDUALS BELONG TO WHICH CLUSTER */

/* Based on the centroid method, choice of 3 clusters (set by ncl=3 in proc tree): */
/* Save the output in a file named cluster_analysis */

/* Centroid clustering on a distance matrix; the tree is saved in
   CLUSTER_ANALYSIS for proc tree. noprint suppresses the printed output.
   The input is DistE, the Euclidean distance matrix computed from BEER
   earlier in this file. The original read data=distance_matrix, but no
   step in this file creates a data set with that name.
   NOTE(review): substitute another proc distance output here if a
   different distance measure is intended. */
proc cluster data=DistE method=centroid nonorm
             outtree=cluster_analysis noprint;
   id brand;
run;
/* Cut the saved tree at three clusters and write one row per
   observation, with its cluster number, to data set CLUSTERS. */
proc tree
   data=cluster_analysis
   out=clusters
   ncl=3
   noprint;
   /* carry the brand variable along so the result can be merged by brand */
   copy brand;
run;
/* Frequency table of the cluster variable: how many observations
   fall in each cluster. */
proc freq data=clusters;
   table cluster;
run;
/* Merge this information into the original data set */

/* Both inputs must be ordered by the merge key before a BY-merge. */
proc sort data=beer;
   by brand;
run;

proc sort data=clusters;
   by brand;
run;

/* Match-merge on brand; only brand and cluster are taken from CLUSTERS. */
data beer_clusters;
   merge beer
         clusters(keep=brand cluster);
   by brand;
run;
/* Scatter plot of quality against price; points are labelled with the
   brand and grouped by cluster. */
proc sgscatter data=beer_clusters;
   plot quality*price
        / datalabel=brand
          group=cluster;
run;
/* Find some DESCRIPTIVE STATISTICS FOR THE CLUSTERS, based on the
variables we have information about (this can include variables other
than the ones used to form the clusters). */
/* Summary statistics of price and quality, reported separately for
   each value of cluster. */
proc means data=beer_clusters;
   class cluster;
   var price quality;
run;
/* Box plots of price for each cluster.
   proc boxplot expects its input ordered by the group variable (cluster).
   BEER_CLUSTERS is ordered by brand after the merge, so a copy sorted by
   cluster is created first and plotted from. The copy leaves
   BEER_CLUSTERS itself in brand order. */
proc sort data=beer_clusters out=beer_clusters_by_cluster;
   by cluster;
run;

proc boxplot data=beer_clusters_by_cluster;
   plot price*cluster;
run;

Data Method Nonorm CCC Pseudo RMSSTD Rsquare RSQ Id Var: Proc Cluster

Uploaded by

Document Information

Original Title

Copyright

Available Formats

Share this document

Share or Embed Document

Sharing Options

Did you find this document useful?

Is this content inappropriate?

Copyright:

Available Formats

Data Method Nonorm CCC Pseudo RMSSTD Rsquare RSQ Id Var: Proc Cluster

Uploaded by

Copyright:

Available Formats

/* HIERARCHICAL CLUSTER ANALYSIS */

/* EUCLIDEAN DISTANCES are used by default in proc cluster */

proc cluster data=chest method=centroid nonorm ccc pseudo

/* The following warning is very common:

proc cluster data=chest method=centroid nonorm ccc pseudo

proc tree data=tree out=PovCL nclusters=2;

/* Nonhierarchical clustering analysis */

proc fastclus data=chest maxclusters=3 radius=0 replace=full maxiter=20

proc distance data=beer out=DistE method=Euclid;

/* When a variable is defined as ordinal, the Euclidean distances are

/* NOTE: A character variable has to be used for the “id” statement. A

/* SQUARED EUCLIDEAN distances */

/* MAHALANOBIS DISTANCES – requires numerical variables */

proc princomp data=beer std out=std_scores outstat=outstat noprint;

/* Then use proc distance to get Mahalanobis distances (=Euclidean

proc distance data=beer2 out=DistGow method=dgower;

/* Method “gower” produces similarity coefficients, while “dgower” (as

/* NOTE: A character variable has to be used for the “id” statement. A

proc cluster data=dist method=average nonorm rmsstd rsquare ;

/* proc CLUSTER squares the distances by default */

proc cluster data=dist method=average nonorm rmsstd rsquare nosquare;

/* FINDING WHICH INDIVIDUALS THAT BELONG TO WHICH CLUSTER */

/* Save the output in a file named cluster_analysis */

proc tree data=cluster_analysis ncl=3 out=clusters noprint;

proc freq data=clusters;

/* Merge this information into the original data set */

proc sgscatter data=beer_clusters;

/* Find some DESCRIPTIVE STATISTICS FOR THE CLUSTERS, based on the

proc means data=beer_clusters;

proc boxplot data= beer_clusters;

You might also like