/*****************************************************************************/ /*IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. */ /*By downloading, copying, installing or using the software you agree */ /*to this license. If you do not agree to this license, do not download, */ /*install, copy or use the software. */ /* */ /* */ /*Copyright (c) 2005 Northwestern University */ /*All rights reserved. */ /*Redistribution of the software in source and binary forms, */ /*with or without modification, is permitted provided that the */ /*following conditions are met: */ /* */ /*1 Redistributions of source code must retain the above copyright */ /* notice, this list of conditions and the following disclaimer. */ /* */ /*2 Redistributions in binary form must reproduce the above copyright */ /* notice, this list of conditions and the following disclaimer in the */ /* documentation and/or other materials provided with the distribution.*/ /* */ /*3 Neither the name of Northwestern University nor the names of its */ /* contributors may be used to endorse or promote products derived */ /* from this software without specific prior written permission. */ /* */ /*THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS */ /*IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED */ /*TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, NON-INFRINGEMENT AND */ /*FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL */ /*NORTHWESTERN UNIVERSITY OR ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, */ /*INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ /*(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR */ /*SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) */ /*HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, */ /*STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN */ /*ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ /*POSSIBILITY OF SUCH DAMAGE. */ /******************************************************************************/ /*************************************************************************/ /** File: example.c **/ /** Description: Takes as input a file: **/ /** ascii file: containing 1 data point per line **/ /** binary file: first int is the number of objects **/ /** 2nd int is the no. of features of each **/ /** object **/ /** This example performs a fuzzy c-means clustering **/ /** on the data. Fuzzy clustering is performed using **/ /** min to max clusters and the clustering that gets **/ /** the best score according to a compactness and **/ /** separation criterion are returned. **/ /** Author: Wei-keng Liao **/ /** ECE Department Northwestern University **/ /** email: wkliao@ece.northwestern.edu **/ /** **/ /** Edited by: Jay Pisharath **/ /** Northwestern University. **/ /** **/ /** ================================================================ **/ /** * **/ /** Edited by: Shuai Che, David Tarjan, Sang-Ha Lee * **/ /** University of Virginia * **/ /** * **/ /** Description: No longer supports fuzzy c-means clustering; * **/ /** only regular k-means clustering. * **/ /** No longer performs "validity" function to * analyze **/ /** compactness and separation crietria; instead * **/ /** calculate root mean squared error. * **/ /** **/ /*************************************************************************/ #define _CRT_SECURE_NO_DEPRECATE 1 #include "kmeans.h" #include #include #include #include #include #include #include extern double wtime(void); /*---< usage() >------------------------------------------------------------*/ void usage(char *argv0) { char *help = "\nUsage: %s [switches] -i filename\n\n" " -i filename :file containing data to be clustered\n" " -m max_nclusters :maximum number of clusters allowed " "[default=5]\n" " -n min_nclusters :minimum number of clusters allowed " "[default=5]\n" " -t threshold :threshold value " "[default=0.001]\n" " -l nloops :iteration for each number of clusters " "[default=1]\n" " -b :input file is in binary format\n" " -r :calculate RMSE " "[default=off]\n" " -o :output cluster center coordinates " "[default=off]\n"; fprintf(stderr, help, argv0); exit(-1); } /*---< main() >-------------------------------------------------------------*/ int setup(int argc, char **argv) { int opt; extern char *optarg; char *filename = 0; float *buf; char line[1024]; int isBinaryFile = 0; float threshold = 0.001; /* default value */ int max_nclusters = 5; /* default value */ int min_nclusters = 5; /* default value */ int best_nclusters = 0; int nfeatures = 0; int npoints = 0; float len; float **features; float **cluster_centres = NULL; int i, j, index; int nloops = 1; /* default value */ int isRMSE = 0; float rmse; int isOutput = 0; // float cluster_timing, io_timing; /* obtain command line arguments and change appropriate options */ while ((opt = getopt(argc, argv, "i:t:m:n:l:bro")) != EOF) { switch (opt) { case 'i': filename = optarg; break; case 'b': isBinaryFile = 1; break; case 't': threshold = atof(optarg); break; case 'm': max_nclusters = atoi(optarg); break; case 'n': min_nclusters = atoi(optarg); break; case 'r': isRMSE = 1; break; case 'o': isOutput = 1; break; case 'l': nloops = atoi(optarg); break; case '?': usage(argv[0]); break; default: usage(argv[0]); break; } } /* ============== I/O begin ==============*/ /* get nfeatures and npoints */ // io_timing = omp_get_wtime(); /*if (isBinaryFile) { // Binary file input FILE *infile; if ((infile = fopen("100", "r")) == NULL) { fprintf(stderr, "Error: no such file (%s)\n", filename); exit(1); } fread(&npoints, 1, sizeof(int), infile); fread(&nfeatures, 1, sizeof(int), infile); // allocate space for features[][] and read attributes of all objects buf = (float *)malloc(npoints * nfeatures * sizeof(float)); features = (float **)malloc(npoints * sizeof(float *)); features[0] = (float *)malloc(npoints * nfeatures * sizeof(float)); for (i = 1; i < npoints; i++) { features[i] = features[i - 1] + nfeatures; } fread(buf, 1, npoints * nfeatures * sizeof(float), infile); fclose(infile); } else { FILE *infile; if ((infile = fopen("100", "r")) == NULL) { fprintf(stderr, "Error: no such file (%s)\n", filename); exit(1); } while (fgets(line, 1024, infile) != NULL) if (strtok(line, " \t\n") != 0) { npoints++; } rewind(infile); while (fgets(line, 1024, infile) != NULL) { if (strtok(line, " \t\n") != 0) { // ignore the id (first attribute): nfeatures = 1; while (strtok(NULL, " ,\t\n") != NULL) nfeatures++; break; } } // allocate space for features[] and read attributes of all objects buf = (float *)malloc(npoints * nfeatures * sizeof(float)); features = (float **)malloc(npoints * sizeof(float *)); features[0] = (float *)malloc(npoints * nfeatures * sizeof(float)); for (i = 1; i < npoints; i++) features[i] = features[i - 1] + nfeatures; rewind(infile); i = 0; while (fgets(line, 1024, infile) != NULL) { if (strtok(line, " \t\n") == NULL) continue; for (j = 0; j < nfeatures; j++) { buf[i] = atof(strtok(NULL, " ,\t\n")); i++; } } fclose(infile); }*/ npoints = 100; nfeatures = 100; buf = (float *)malloc(npoints * nfeatures * sizeof(float)); features = (float **)malloc(npoints * sizeof(float *)); features[0] = (float *)malloc(npoints * nfeatures * sizeof(float)); for (i = 1; i < npoints; i++) { features[i] = features[i - 1] + nfeatures; } for (i = 0; i < npoints * nfeatures; ++i) { buf[i] = (i % 64); } // io_timing = omp_get_wtime() - io_timing; printf("\nI/O completed\n"); printf("\nNumber of objects: %d\n", npoints); printf("Number of features: %d\n", nfeatures); /* ============== I/O end ==============*/ // error check for clusters if (npoints < min_nclusters) { printf("Error: min_nclusters(%d) > npoints(%d) -- cannot proceed\n", min_nclusters, npoints); exit(0); } srand(7); /* seed for future random number generator */ memcpy( features[0], buf, npoints * nfeatures * sizeof( float)); /* now features holds 2-dimensional array of features */ free(buf); /* ======================= core of the clustering ===================*/ // cluster_timing = omp_get_wtime(); /* Total clustering time */ cluster_centres = NULL; index = cluster(npoints, /* number of data points */ nfeatures, /* number of features for each point */ features, /* array: [npoints][nfeatures] */ min_nclusters, /* range of min to max number of clusters */ max_nclusters, threshold, /* loop termination factor */ &best_nclusters, /* return: number between min and max */ &cluster_centres, /* return: [best_nclusters][nfeatures] */ &rmse, /* Root Mean Squared Error */ isRMSE, /* calculate RMSE */ nloops); /* number of iteration for each number of clusters */ // cluster_timing = omp_get_wtime() - cluster_timing; /* =============== Command Line Output =============== */ /* cluster center coordinates :displayed only for when k=1*/ if ((min_nclusters == max_nclusters) && (isOutput == 1)) { printf("\n================= Centroid Coordinates =================\n"); for (i = 0; i < max_nclusters; i++) { printf("%d:", i); for (j = 0; j < nfeatures; j++) { printf(" %.2f", cluster_centres[i][j]); } printf("\n\n"); } } len = (float)((max_nclusters - min_nclusters + 1) * nloops); printf("Number of Iteration: %d\n", nloops); // printf("Time for I/O: %.5fsec\n", io_timing); // printf("Time for Entire Clustering: %.5fsec\n", cluster_timing); if (min_nclusters != max_nclusters) { if (nloops != 1) { // range of k, multiple iteration // printf("Average Clustering Time: %fsec\n", // cluster_timing / len); printf("Best number of clusters is %d\n", best_nclusters); } else { // range of k, single iteration // printf("Average Clustering Time: %fsec\n", // cluster_timing / len); printf("Best number of clusters is %d\n", best_nclusters); } } else { if (nloops != 1) { // single k, multiple iteration // printf("Average Clustering Time: %.5fsec\n", // cluster_timing / nloops); if (isRMSE) // if calculated RMSE printf("Number of trials to approach the best RMSE of %.3f is %d\n", rmse, index + 1); } else { // single k, single iteration if (isRMSE) // if calculated RMSE printf("Root Mean Squared Error: %.3f\n", rmse); } } /* free up memory */ free(features[0]); free(features); return (0); }