Hi, can anyone help me convert this code into Java and let me know which classes it should have?

// Clustering.cpp : Defines the entry point for the console application.
//

#include "stdafx.h"
#include <string.h>
#include "MersenneTwister.h"

// Dissimilarity returned for two nominal values that differ; also the
// per-nominal-field maximum that distance() adds to its normalising sum.
#define CATAGORICAL_CONST   30.0
// NOTE(review): not referenced in the visible part of this file; presumably
// the capacity of file-name buffers - confirm against the callers.
#define MAX_FILENAME_SIZE   500

// Capacity of InputBuffer (one raw input line).
#define MAX_BUFFER_SIZE 500
// Capacity of one token / one Field::text string, including the terminator.
#define MAX_STRING_SIZE 100
// Maximum number of comma-separated fields handled per line.
#define MAX_TOKENS      100

// Number of chromosomes held in the population array.
#define MAX_POPULATION  20

// Field type codes stored in FieldType[].
#define TOKEN_UNUSED            0
#define TOKEN_LINEAR_INTEGER    1
#define TOKEN_LINEAR_FLOAT      2   
#define TOKEN_NOMINAL           3

// Number of rows in the Records table.
#define MAX_RECORDS 10000

// Capacity of the peaks/heights and troughs/depths arrays.
#define MAX_NUMBER_OF_PEAKS     10
// Number of histogram buckets: one per integer distance 0..100.
#define FREQUENCY_100           101

// Numerator of the acceptance probability used by readSampleData().
#define SAMPLE_SIZE     300

// One cell of the data table: the token as read from the file plus, for
// linear (integer/float) fields, its numeric value.
struct Field
{
    char  text[MAX_STRING_SIZE];   // raw token text copied from Tokens[]
    float value;                   // parsed number; only written for linear fields
};

// One candidate feature subset for the genetic search.
struct Chromosome
{
    int  fitness;              // score from fitness(); replace()/getBest() treat lower as better
    bool genes[MAX_TOKENS];    // genes[i] == true means field i is used by distance()
};

// Raw text of the line most recently read by getline().
char InputBuffer [MAX_BUFFER_SIZE];

// Tokens of the line most recently split by getTokens(), one per row.
char Tokens[MAX_TOKENS][MAX_STRING_SIZE];

// Type code (TOKEN_*) of each field, decided by setFieldType()/setMinMax().
int FieldType[MAX_TOKENS];

// Smallest / largest value seen for each linear field (see setMinMax()).
float MinFieldType[MAX_TOKENS];
float MaxFieldType[MAX_TOKENS];

// The data table: Records[record][field].
Field Records[MAX_RECORDS][MAX_TOKENS];

// Cluster id assigned to each record; -1 means "not allocated".
int   allocated_cluster[MAX_RECORDS];

// Mask of fields that the genetic search is allowed to switch on.
bool g[MAX_TOKENS];

// Genetic-algorithm population and the scratch child built by crossover().
Chromosome population[MAX_POPULATION];
Chromosome child;

// Loop bounds used by get_freq_100() / analyseClusters() and distance().
// NOTE(review): both are assigned outside the visible part of the file.
int number_of_points;
int number_of_features;

// Results of count_troughs(): how many were found, their positions and values.
int number_of_troughs;
int troughs[MAX_NUMBER_OF_PEAKS];
int depths [MAX_NUMBER_OF_PEAKS];

// Results of count_peaks(): how many were found, their positions and values.
int number_of_peaks;
int peaks[MAX_NUMBER_OF_PEAKS];
int heights[MAX_NUMBER_OF_PEAKS];

// Divisor of the sampling probability in readSampleData().
// NOTE(review): assigned outside the visible part of the file.
int total_number_of_records;

// Histogram of pairwise distances (bucket = distance 0..100).
int freq_100 [FREQUENCY_100 ];

// freq_100 after one smoothing pass, and after two (see get_freq_100_ave()).
int freq_100_int[FREQUENCY_100];
int freq_100_ave[FREQUENCY_100];

// Last cluster id handed out by get_next_cluster(); -1 before the first call.
int next_cluster = -1;

// Number of records in each cluster, indexed by cluster id.
int cluster_sizes[MAX_RECORDS];

// Random number generator from MersenneTwister.h, invoked as random().
// Callers compare the result with fractions such as 0.5, so it presumably
// yields a value in the unit interval - confirm against MersenneTwister.h.
MTRand random;

// Hands out the next unused cluster id by advancing the global counter.
// The first call returns 0 because next_cluster starts at -1.
int get_next_cluster()
{
    return ++next_cluster;
}

// Enables every entry of the global field mask g.
void init_g()
{
    int remaining = MAX_TOKENS;

    while (remaining > 0)
    {
        remaining--;
        g[remaining] = true;
    }
}

// Marks every field as TOKEN_UNUSED so that setFieldType() starts from scratch.
void initFieldType()
{
    int* slot = FieldType;
    int* const end = FieldType + MAX_TOKENS;

    while (slot != end)
    {
        *slot = TOKEN_UNUSED;
        slot++;
    }
}

// Resets every record to "not allocated" (-1) in allocated_cluster.
void initAllocatedCluster()
{
    int record = 0;

    while (record < MAX_RECORDS)
    {
        allocated_cluster[record] = -1;
        record++;
    }
}

// Zeroes the member count of every cluster.
void initClusterSizes()
{
    memset(cluster_sizes, 0, sizeof(cluster_sizes));
}

#ifndef MAX_BUFFER_SIZE
#define MAX_BUFFER_SIZE 500   // fallback only; normally defined at the top of this file
#endif

// Reads the next non-empty line from fp into buffer as a NUL-terminated string.
//
//   fp      open input stream
//   buffer  destination with room for at least MAX_BUFFER_SIZE characters
//
// Returns true when a line was stored, false when the end of the file was
// reached without reading any character.
//
// Empty lines are skipped. Characters beyond MAX_BUFFER_SIZE - 1 are read and
// discarded so that an over-long line cannot overrun the buffer. The last line
// of a file is terminated correctly even without a trailing newline.
bool getline(FILE *fp, char *buffer)
{
    int length = 0;

    for (;;)
    {
        // getc() returns an int so that EOF can be told apart from every
        // valid character value; storing it in a char loses that distinction.
        int c = getc(fp);

        if (c == EOF)
        {
            buffer[length] = '\0';
            return (length > 0);
        }

        if (c == '\n')
        {
            if (length > 0)
            {
                buffer[length] = '\0';
                return true;
            }
            continue;   // blank line: keep looking for content
        }

        if (length < MAX_BUFFER_SIZE - 1)
        {
            buffer[length] = (char)c;
            length++;
        }
    }
}

int getTokens(char* buffer, char seperator)
{
    int i;
    int j;
    int k;

    if (buffer[0] == '\0') return 0;

    for (i=0; i<MAX_TOKENS; i++)
    {
        for (j=0; j<MAX_STRING_SIZE; j++)
        {
            Tokens[i][j] = '\0';
        }
    }

    i = 0;
    j = 0;
    k = 0;

    while (buffer[i] != '\0')
    {
        if (buffer[i] == seperator)
        {
            j++;
            k = 0;
        }
        else
        {
            Tokens[j][k] = buffer[i];
            k++;
        }

        i++;
    }

    return j+1;
}

// Returns true when string is non-empty and consists solely of the decimal
// digits '0'..'9'. Signs, dots and the missing-value marker '?' make it false.
bool isInteger(char* string)
{
    const char* cursor = string;

    if (*cursor == '\0') return false;   // an empty token is not a number

    while (*cursor != '\0')
    {
        if ((*cursor < '0') || (*cursor > '9')) return false;
        cursor++;
    }

    return true;
}

// Returns true when string contains only decimal digits and dots, with
// exactly one dot and at least one digit (e.g. "1.5", ".5", "5.").
// Anything else - including signs, exponents, "?" and "" - returns false.
bool isFloat(char* string)
{
    int digits = 0;
    int dots = 0;
    const char* cursor;

    for (cursor = string; *cursor != '\0'; cursor++)
    {
        if ((*cursor >= '0') && (*cursor <= '9'))
        {
            digits++;
        }
        else if (*cursor == '.')
        {
            dots++;
        }
        else
        {
            return false;
        }
    }

    return (dots == 1) && (digits > 0);
}

// Returns true when string contains at least one ASCII letter and does not
// start with the missing-value marker '?'. An empty string returns false.
bool isText(char* string)
{
    const char* cursor;

    if (string[0] == '?') return false;

    for (cursor = string; *cursor != '\0'; cursor++)
    {
        const char ch = *cursor;
        const bool lower = (ch >= 'a') && (ch <= 'z');
        const bool upper = (ch >= 'A') && (ch <= 'Z');

        if (lower || upper) return true;
    }

    return false;
}

// Scans the data file and decides the type of every field, storing the result
// in FieldType[].
//
//   filename        comma-separated data file
//   titles_present  true when the first line holds column titles and must be skipped
//
// A field only ever moves "up" the chain unused -> integer -> float -> nominal:
// one token with a letter makes the whole column nominal, one token with a dot
// turns an integer column into a float column. Prints a message and leaves
// FieldType untouched when the file cannot be opened.
void setFieldType(char* filename, bool titles_present)
{
    FILE* input = fopen(filename, "r");
    int   line_index;
    int   token_count;
    int   t;

    if (input == NULL)
    {
        printf("\n");
        printf("\n  ABORTING >> Data File %s not found.  ", filename);
        printf("\n");
        return;
    }

    for (line_index = 0; getline(input, InputBuffer); line_index++)
    {
        if (titles_present && (line_index == 0)) continue;   // title row

        token_count = getTokens(InputBuffer, ',');

        for (t = 0; t < token_count; t++)
        {
            const bool has_letters = isText(Tokens[t]);
            const bool is_whole    = isInteger(Tokens[t]);
            const bool is_real     = isFloat(Tokens[t]);

            if (FieldType[t] == TOKEN_UNUSED)
            {
                if (has_letters) FieldType[t] = TOKEN_NOMINAL;
                if (is_whole)    FieldType[t] = TOKEN_LINEAR_INTEGER;
                if (is_real)     FieldType[t] = TOKEN_LINEAR_FLOAT;
            }
            else if (FieldType[t] == TOKEN_LINEAR_INTEGER)
            {
                if (has_letters) FieldType[t] = TOKEN_NOMINAL;
                if (is_real)     FieldType[t] = TOKEN_LINEAR_FLOAT;
            }
            else if (FieldType[t] == TOKEN_LINEAR_FLOAT)
            {
                if (has_letters) FieldType[t] = TOKEN_NOMINAL;
            }
            // TOKEN_NOMINAL (and any other code) is final.
        }
    }

    fclose(input);
}

void setMinMax(char* filename, bool titles_present)
{
    FILE *ifp;
    bool init_min_max = true;
    int  number_of_lines = 0;
    int  number_of_tokens;
    int  i;
    float value;
    long  d_value;

    if ( ( ifp = fopen( filename, "r" ) ) != NULL ) 
    {
        while (getline(ifp, InputBuffer))
        {
            if ((!titles_present) || (number_of_lines > 0))
            {
                number_of_tokens = getTokens(InputBuffer, ',');

                if (number_of_tokens > 0)
                {
                    for (i=0; i<number_of_tokens; i++)
                    {
                        switch (FieldType[i])
                        {
                        case TOKEN_UNUSED:
                            break;

                        case TOKEN_LINEAR_INTEGER:
                            if (sscanf(Tokens[i], "%d", &d_value) == 1)
                            {
                                value = (float)d_value;

                                if (init_min_max)
                                {
                                    MinFieldType[i] = value;
                                    MaxFieldType[i] = value;
                                }
                                else
                                {
                                    if (MinFieldType[i] > value) MinFieldType[i] = value;
                                    if (MaxFieldType[i] < value) MaxFieldType[i] = value;
                                }
                            }
                            break;

                        case TOKEN_LINEAR_FLOAT:
                            if (sscanf(Tokens[i], "%f", &value) == 1)
                            {
                                if (init_min_max)
                                {
                                    MinFieldType[i] = value;
                                    MaxFieldType[i] = value;
                                }
                                else
                                {
                                    if (MinFieldType[i] > value) MinFieldType[i] = value;
                                    if (MaxFieldType[i] < value) MaxFieldType[i] = value;
                                }
                            }
                            break;

                        case TOKEN_NOMINAL:
                            break;

                        default:
                            break;
                        }
                    }
                }
                init_min_max = false;
            }
            number_of_lines++;
        }
        fclose(ifp);

        // Cater for 0..1 being a nominal value

        for (i=0; i<number_of_tokens; i++)
        {
            switch (FieldType[i])
            {
            case TOKEN_UNUSED:
                break;

            case TOKEN_LINEAR_INTEGER:
                if ((MinFieldType[i] == 0) && (MaxFieldType[i] == 1))
                {
                    FieldType[i] = TOKEN_NOMINAL;
                }
                break;

            case TOKEN_LINEAR_FLOAT:
                break;

            case TOKEN_NOMINAL:
                break;

            default:
                break;
            }
        }
    }
    else
    {
        printf("\n");
        printf("\n  ABORTING >> Data File %s not found.  ", filename);
        printf("\n");
    }
}

// Prints the type (and, for linear fields, the value range) of every field
// that is in use, followed by the total.
//
// Returns the number of fields whose type is not TOKEN_UNUSED.
int printFieldType()
{
    int number_of_fields = 0;
    int i;

    printf("\n");

    for (i=0; i<MAX_TOKENS; i++)
    {
        switch (FieldType[i])
        {
        case TOKEN_LINEAR_INTEGER:
            // The bounds are passed as long, so they must be printed with %ld.
            printf("Field %d is LINEAR (%ld ... %ld)\n", i, (long)MinFieldType[i], (long)MaxFieldType[i]);
            number_of_fields++;
            break;

        case TOKEN_LINEAR_FLOAT:
            printf("Field %d is LINEAR (%f ... %f)\n", i, MinFieldType[i], MaxFieldType[i]);
            number_of_fields++;
            break;

        case TOKEN_NOMINAL:
            printf("Field %d is NOMINAL\n", i);
            number_of_fields++;
            break;

        case TOKEN_UNUSED:
        default:
            break;
        }
    }
    printf("\n");
    printf("There are %d fields\n", number_of_fields);
    printf("\n");

    return number_of_fields;
}

int readNumberOfRecords(char* filename, bool titles_present)
{
    FILE *ifp;
    int  number_of_lines = 0;
    int  number_of_records = 0;
    int  number_of_tokens;
    int  i;
    float f_value;
    long  d_value;
    bool  record_okay;

    if ( ( ifp = fopen( filename, "r" ) ) != NULL ) 
    {
        printf("\n");
        printf("Reading Number Of Records in %s\n", filename);

        while (getline(ifp, InputBuffer))
        {
            if ((!titles_present) || (number_of_lines > 0))
            {
                number_of_tokens = getTokens(InputBuffer, ',');

                if (number_of_tokens > 0)
                {
                    record_okay = true;

                    for (i=0; i<number_of_tokens; i++)
                    {
                        if (strcmp(Tokens[i], "?") == 0) record_okay = false;

                        strcpy(Records[number_of_records][i].text, Tokens[i]);

                        switch (FieldType[i])
                        {
                        case TOKEN_UNUSED:
                            break;

                        case TOKEN_LINEAR_INTEGER:
                            if (sscanf(Tokens[i], "%d", &d_value) != 1)
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_LINEAR_FLOAT:
                            if (sscanf(Tokens[i], "%f", &f_value) != 1)
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_NOMINAL:
                            break;

                        default:
                            break;
                        }
                    }
                    if (record_okay) number_of_records++;
                }
            }
            number_of_lines++;
        }
        fclose(ifp);

        printf("... %d records read\n", number_of_records);
        printf("\n");
    }
    else
    {
        printf("\n");
        printf("\n  ABORTING >> Data File %s not found.  ", filename);
        printf("\n");
    }

    return number_of_records;
}

int readSampleData(char* filename, bool titles_present)
{
    FILE *ifp;
    int  number_of_lines = 0;
    int  number_of_records = 0;
    int  number_of_tokens;
    int  i;
    float f_value;
    long  d_value;
    bool  record_okay;
    double sample_limit;

    sample_limit = (double)SAMPLE_SIZE / (double)total_number_of_records;

    if ( ( ifp = fopen( filename, "r" ) ) != NULL ) 
    {
        printf("\n");
        printf("Reading sample data from %s\n", filename);

        while (getline(ifp, InputBuffer))
        {
            if ((!titles_present) || (number_of_lines > 0))
            {
                number_of_tokens = getTokens(InputBuffer, ',');

                if (number_of_tokens > 0)
                {
                    record_okay = true;

                    for (i=0; i<number_of_tokens; i++)
                    {
                        if (strcmp(Tokens[i], "?") == 0) record_okay = false;

                        strcpy(Records[number_of_records][i].text, Tokens[i]);

                        switch (FieldType[i])
                        {
                        case TOKEN_UNUSED:
                            break;

                        case TOKEN_LINEAR_INTEGER:
                            if (sscanf(Tokens[i], "%d", &d_value) == 1)
                            {
                                f_value = (float)d_value;
                                Records[number_of_records][i].value = f_value;
                            }
                            else
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_LINEAR_FLOAT:
                            if (sscanf(Tokens[i], "%f", &f_value) == 1)
                            {
                                Records[number_of_records][i].value = f_value;
                            }
                            else
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_NOMINAL:
                            break;

                        default:
                            break;
                        }
                    }
                    if (record_okay) 
                    {
                        if (random() < sample_limit)
                        {
                            number_of_records++;
                        }
                    }
                }
            }
            number_of_lines++;
        }
        fclose(ifp);

        printf("... %d records read\n", number_of_records);
        printf("\n");
    }
    else
    {
        printf("\n");
        printf("\n  ABORTING >> Data File %s not found.  ", filename);
        printf("\n");
    }

    return number_of_records;
}

int readData(char* filename, bool titles_present)
{
    FILE *ifp;
    int  number_of_lines = 0;
    int  number_of_records = 0;
    int  number_of_tokens;
    int  i;
    float f_value;
    long  d_value;
    bool  record_okay;

    if ( ( ifp = fopen( filename, "r" ) ) != NULL ) 
    {
        printf("\n");
        printf("Reading %s\n", filename);

        if (number_of_records >= MAX_RECORDS) return MAX_RECORDS;  // Safety check

        while (getline(ifp, InputBuffer))
        {
            if ((!titles_present) || (number_of_lines > 0))
            {
                number_of_tokens = getTokens(InputBuffer, ',');

                if (number_of_tokens > 0)
                {
                    record_okay = true;

                    for (i=0; i<number_of_tokens; i++)
                    {
                        if (strcmp(Tokens[i], "?") == 0) record_okay = false;

                        strcpy(Records[number_of_records][i].text, Tokens[i]);

                        switch (FieldType[i])
                        {
                        case TOKEN_UNUSED:
                            break;

                        case TOKEN_LINEAR_INTEGER:
                            if (sscanf(Tokens[i], "%d", &d_value) == 1)
                            {
                                f_value = (float)d_value;
                                Records[number_of_records][i].value = f_value;
                            }
                            else
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_LINEAR_FLOAT:
                            if (sscanf(Tokens[i], "%f", &f_value) == 1)
                            {
                                Records[number_of_records][i].value = f_value;
                            }
                            else
                            {
                                record_okay = false;
                            }
                            break;

                        case TOKEN_NOMINAL:
                            break;

                        default:
                            break;
                        }
                    }
                    if (record_okay) number_of_records++;
                }
            }
            number_of_lines++;
        }
        fclose(ifp);

        printf("... %d records read\n", number_of_records);
        printf("\n");
    }
    else
    {
        printf("\n");
        printf("\n  ABORTING >> Data File %s not found.  ", filename);
        printf("\n");
    }

    return number_of_records;
}

// Expresses value as a percentage of the width of the range
// [min_value, max_value], truncated towards zero.
//
// min_value is deliberately not subtracted from value: the callers pass a
// difference between two values (or a sum of such percentages), not a position
// within the range.
//
// A zero-width range (max_value == min_value) has no scale, and dividing by it
// would produce NaN or infinity, whose conversion to int is undefined. In that
// case the result is 0 when value is 0 and 100 otherwise.
int map100(float value, float min_value, float max_value)
{
    const float range = max_value - min_value;

    if (range == 0)
    {
        return (value == 0) ? 0 : 100;
    }

    return (int)((100.0 * value) / range);
}

// Dissimilarity of two linear values: the absolute difference |a - b| scaled
// by map100() against the field's range. A negative scaled result is replaced
// by 100.
float dissimilarity_linear(float a, float b, float min_value, float max_value)
{
    const float gap    = (a > b) ? (a - b) : (b - a);
    const float scaled = (float)map100(gap, min_value, max_value);

    if (scaled < 0.0) return 100;

    return scaled;
}

// Dissimilarity of two nominal values: 0 when the strings are equal,
// CATAGORICAL_CONST when they differ.
float dissimilarity_catagorical(char* a, char* b)
{
    const bool same = (strcmp(a, b) == 0);

    if (!same) return CATAGORICAL_CONST;

    return 0.0;
}

// Distance between records a and b, taking into account only the fields that
// are switched on in g.
//
// Every enabled linear field contributes dissimilarity_linear() out of a
// possible 100; every enabled nominal field contributes
// dissimilarity_catagorical() out of a possible CATAGORICAL_CONST. The total
// is then rescaled by map100() against the sum of the possible maxima.
//
// Returns 100 when no enabled field contributes or when the rescaled value is
// negative.
int distance(bool* g, int a, int b)
{
    const float lowest = 0.0;   // lower bound handed to map100(); always zero
    float total    = 0.0;
    float ceiling  = 0.0;
    float scaled;
    int   field;

    for (field = 0; field < number_of_features; field++)
    {
        if (!g[field]) continue;

        if ((FieldType[field] == TOKEN_LINEAR_INTEGER) || (FieldType[field] == TOKEN_LINEAR_FLOAT))
        {
            total = total + dissimilarity_linear(Records[a][field].value, Records[b][field].value, MinFieldType[field], MaxFieldType[field]);
            ceiling = ceiling + 100.0;
        }
        else if (FieldType[field] == TOKEN_NOMINAL)
        {
            total = total + dissimilarity_catagorical(Records[a][field].text, Records[b][field].text);
            ceiling = ceiling + CATAGORICAL_CONST;
        }
    }

    if (!(ceiling > 0.0)) return 100;

    scaled = map100(total, lowest, ceiling);

    if (scaled >= 0.0) return (int)scaled;

    return 100;
}

// Rebuilds the freq_100 histogram: for every unordered pair of the first
// number_of_points records, the bucket of their distance() (0..100) is
// incremented. Distances outside 0..100 are ignored.
void get_freq_100(bool* g)
{
    int first;
    int second;
    int bucket;

    memset(freq_100, 0, sizeof(freq_100));

    for (first = 0; first < number_of_points; first++)
    {
        // Starting at first + 1 visits each pair exactly once.
        for (second = first + 1; second < number_of_points; second++)
        {
            bucket = distance(g, first, second);

            if ((bucket >= 0) && (bucket <= 100))
            {
                freq_100[bucket]++;
            }
        }
    }
}

// Writes into smoothed[] the integer moving average of source[] over a window
// of up to nine buckets (four either side), clipped at the array ends.
static void smooth_freq_100(const int* source, int* smoothed)
{
    int centre;
    int k;

    for (centre = 0; centre < FREQUENCY_100; centre++)
    {
        int lo  = centre - 4;
        int hi  = centre + 4;
        int sum = 0;

        if (lo < 0) lo = 0;
        if (hi > FREQUENCY_100 - 1) hi = FREQUENCY_100 - 1;

        for (k = lo; k <= hi; k++)
        {
            sum = sum + source[k];
        }

        smoothed[centre] = sum / (hi - lo + 1);
    }
}

// Smooths the distance histogram twice: freq_100 -> freq_100_int (intermediate
// averages) -> freq_100_ave (full averages).
void get_freq_100_ave()
{
    smooth_freq_100(freq_100, freq_100_int);
    smooth_freq_100(freq_100_int, freq_100_ave);
}

// Finds the local maxima of the smoothed histogram freq_100_ave.
//
// Bucket i is a peak when the curve rises into it over the two buckets before
// and falls away over the two buckets after. The position and height of the
// first MAX_NUMBER_OF_PEAKS peaks are stored in peaks[] / heights[]; further
// peaks are counted but not stored.
//
// Returns the number of peaks found (also left in number_of_peaks).
int count_peaks()
{
    const int* f = freq_100_ave;
    int bucket;

    number_of_peaks = 0;

    memset(peaks, 0, sizeof(peaks));
    memset(heights, 0, sizeof(heights));

    for (bucket = 2; bucket < FREQUENCY_100 - 2; bucket++)
    {
        const bool rises_in  = (f[bucket-2] <  f[bucket-1]) && (f[bucket-1] <= f[bucket]);
        const bool falls_out = (f[bucket+2] <= f[bucket+1]) && (f[bucket+1] <  f[bucket]);

        if (!(rises_in && falls_out)) continue;

        if (number_of_peaks < MAX_NUMBER_OF_PEAKS)
        {
            peaks[number_of_peaks]   = bucket;
            heights[number_of_peaks] = f[bucket];
        }
        number_of_peaks++;
    }

    return number_of_peaks;
}

// Finds the local minima of the smoothed histogram freq_100_ave.
//
// Bucket i is a trough when the curve falls into it over the two buckets
// before and then rises again within the next two to four buckets, or when the
// curve drops from a positive value to zero at i. A trough directly following
// the previously stored one (at i - 1) replaces it. The position and depth of
// the first MAX_NUMBER_OF_PEAKS troughs are stored in troughs[] / depths[];
// further troughs are counted but not stored.
//
// Returns the number of troughs found (also left in number_of_troughs).
int count_troughs()
{
    const int* f = freq_100_ave;
    int i;

    number_of_troughs = 0;

    for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
    {
        troughs[i] = 0;
        depths[i] = 0;
    }

    for (i=2; i<FREQUENCY_100-3; i++)
    {
        const bool falls_in = (f[i-2] > f[i-1]) && (f[i-1] >= f[i-0]);

        const bool rises_1 = (f[i+2] > f[i+1]) && (f[i+1] >= f[i+0]);
        const bool rises_2 = (f[i+3] > f[i+2]) && (f[i+2] >= f[i+1]);

        // The widest look-ahead reads bucket i+4, which does not exist for the
        // last value of i; without the bound check this read one element past
        // the end of freq_100_ave.
        // NOTE(review): the second comparison repeats the one used in rises_2;
        // by analogy it may have been meant as f[i+3] >= f[i+2] - confirm
        // before changing, since that would alter which troughs are detected.
        const bool rises_3 = ((i + 4) < FREQUENCY_100)
                          && (f[i+4] > f[i+3]) && (f[i+2] >= f[i+1]);

        const bool hits_zero = (f[i-1] > 0) && (f[i-0] == 0);

        if ((falls_in && (rises_1 || rises_2 || rises_3)) || hits_zero)
        {
            if (number_of_troughs < MAX_NUMBER_OF_PEAKS)
            {
                if (number_of_troughs > 0)
                {
                    // Adjacent troughs are one trough: overwrite the previous entry.
                    if (troughs[number_of_troughs-1] == (i-1)) number_of_troughs--;
                }
                troughs[number_of_troughs] = i;
                depths[number_of_troughs] = f[i];
            }
            number_of_troughs++;
        }
    }

    return number_of_troughs;
}

// Writes the raw distance histogram to outfile: a "Frequency" heading followed
// by one bucket count per line. Does nothing when the file cannot be opened.
void save_freq_100(char* outfile)
{
    FILE* out = fopen(outfile, "w");
    int bucket = 0;

    if (out == NULL) return;

    fprintf(out, "Frequency\n");

    while (bucket < FREQUENCY_100)
    {
        fprintf(out, "%d\n", freq_100[bucket]);
        bucket++;
    }

    fclose(out);
}

// Writes the smoothed distance histogram to outfile: an "AverageFrequency"
// heading followed by one bucket value per line. Does nothing when the file
// cannot be opened.
void save_freq_100_ave(char* outfile)
{
    FILE* out = fopen(outfile, "w");
    int bucket = 0;

    if (out == NULL) return;

    fprintf(out, "AverageFrequency\n");

    while (bucket < FREQUENCY_100)
    {
        fprintf(out, "%d\n", freq_100_ave[bucket]);
        bucket++;
    }

    fclose(out);
}

void clusterData(bool* g, int number_of_records, int clustering_distance)
{
    int i;
    int j;
    int k;
    int current_cluster;
    int new_cluster;

    printf("\n");
    printf("Clustering\n");
    printf("\n");

    for (i=0; i<number_of_records; i++)
    {
        if (allocated_cluster[i] == -1)  // not allocated
        {
            current_cluster = get_next_cluster();
            allocated_cluster[i] = current_cluster;
        }
        else
        {
            current_cluster = allocated_cluster[i];
        }

        for (j=i+1; j<number_of_records; j++)
        {
            if (distance(g, i, j) < clustering_distance)
            {
                if (allocated_cluster[j] == -1)  // not allocated
                {
                    allocated_cluster[j] = current_cluster;
                }
                else
                {
                    // merge clusters

                    new_cluster = allocated_cluster[j];

                    for (k=0; k<number_of_records; k++)
                    {
                        if (allocated_cluster[k] == current_cluster) allocated_cluster[k] = new_cluster;
                    }

                    current_cluster = new_cluster;
                }
            }
        }
    }
}

// Adds one to cluster_sizes[c] for every record allocated to cluster c.
// Records that are unallocated (-1) or carry an out-of-range id are ignored.
// The counters are not cleared here; see initClusterSizes().
void setClusterSizes()
{
    int record;

    for (record = 0; record < MAX_RECORDS; record++)
    {
        const int cluster = allocated_cluster[record];

        if ((cluster < 0) || (cluster >= MAX_RECORDS)) continue;

        cluster_sizes[cluster]++;
    }
}

// Reports the size of every non-empty cluster, both on standard output and in
// the report file rfp.
void reportClusters(FILE* rfp)
{
    int cluster;

    for (cluster = 0; cluster < MAX_RECORDS; cluster++)
    {
        const int members = cluster_sizes[cluster];

        if (members <= 0) continue;

        printf("Cluster %d has %d members\n", cluster, members);
        fprintf(rfp, "Cluster %d has %d members\n", cluster, members);
    }
}

// Scores a gene set (lower is better) and prints a short report.
//
// The distance histogram is rebuilt for the fields enabled in genes, smoothed,
// and its peaks and troughs are located. When there is more than one peak, at
// least one trough, and the first trough lies at bucket 3 or beyond, the score
// is the position of the first trough plus (number of enabled, used fields - 1)
// as a penalty for carrying extra genes. Otherwise the score is 1000000.
int fitness(bool* genes)
{
    int slot;
    int penalty = -1;
    int score;

    for (slot = 0; slot < MAX_TOKENS; slot++)
    {
        // add a penalty for having redundant genes being present
        if (genes[slot] && (FieldType[slot] != TOKEN_UNUSED)) penalty++;
    }

    printf("\n");
    printf("--------------------------\n");
    printf("\n");

    get_freq_100(genes);
    get_freq_100_ave();

    number_of_peaks = count_peaks();
    printf("Peaks = %d\n", number_of_peaks);
    for (slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++)
    {
        if (peaks[slot] != 0) printf("%d (%d)  ", peaks[slot], heights[slot]);
    }
    printf("\n");
    printf("\n");

    number_of_troughs = count_troughs();
    printf("Troughs = %d\n", number_of_troughs);
    for (slot = 0; slot < MAX_NUMBER_OF_PEAKS; slot++)
    {
        if (troughs[slot] != 0) printf("%d (%d)  ", troughs[slot], depths[slot]);
    }
    printf("\n");
    printf("\n");

    if ((number_of_peaks > 1) && (number_of_troughs >= 1) && (troughs[0] >= 3))
    {
        score = troughs[0] + penalty;
        printf("Fitness = %d\n\n", score);
        return score;
    }

    printf("Fitness = 1000000\n\n");
    return 1000000;
}

void initPopulation()
{
    int i;
    int j;

    for (i=0; i<MAX_POPULATION; i++)
    {
        population[i].fitness = 1000;

        for (j=0; j<MAX_TOKENS; j++)
        {
            if (g[j])
            {
                if (random() < 0.5)
                {
                    population[i].genes[j] = true;
                }
                else
                {
                    population[i].genes[j] = false;
                }
            }
            else
            {
                population[i].genes[j] = false;
            }
        }

        population[i].fitness = fitness(population[i].genes);
    }
}

// Picks a random population index below MAX_POPULATION, drawing again
// whenever the scaled random value reaches MAX_POPULATION.
int select()
{
    int pick;

    do
    {
        pick = (int)(random() * MAX_POPULATION);
    }
    while (pick >= MAX_POPULATION);

    return pick;
}

// Builds child.genes by uniform crossover: each gene is copied from population
// member a or member b with equal probability.
void crossover(int a, int b)
{
    int locus;

    for (locus = 0; locus < MAX_TOKENS; locus++)
    {
        const int parent = (random() < 0.5) ? a : b;

        child.genes[locus] = population[parent].genes[locus];
    }
}

// With probability one half, mutates one randomly chosen gene of child among
// the first number_of_fields genes: the gene is flipped when the global mask g
// allows that field, and forced off otherwise.
//
// NOTE(review): unlike select(), the chosen index is not re-drawn when the
// scaled random value equals number_of_fields - confirm the range of random().
void mutation(int number_of_fields)
{
    const int target = (int)(random() * number_of_fields);

    if (!(random() < 0.5)) return;

    child.genes[target] = g[target] ? !child.genes[target] : false;
}

// Scores the child and lets it compete with one randomly selected population
// member; the child takes that member's place when its fitness is lower.
void replace()
{
    int rival;

    child.fitness = fitness(child.genes);

    rival = select();

    if (child.fitness < population[rival].fitness)
    {
        // Whole-struct copy: transfers the genes and the fitness together.
        population[rival] = child;
    }
}

// Returns the index of the population member with the lowest fitness below
// 1000 (the first one wins a tie), or 0 when no member scores below 1000.
int getBest()
{
    int winner = 0;
    int lowest = 1000;
    int member;

    for (member = 0; member < MAX_POPULATION; member++)
    {
        if (population[member].fitness >= lowest) continue;

        winner = member;
        lowest = population[member].fitness;
    }

    return winner;
}

// Writes every non-empty cluster to its own text file "<file>_<cluster>.txt".
//
// file              : base name used to build the output file names.
// number_of_records : number of records to scan; also the upper bound of the
//                     cluster indices that are examined.
//
// Each record of a cluster becomes one line holding the text of every field
// whose type is not TOKEN_UNUSED, separated by commas. A file that cannot be
// created is reported on stdout and skipped; the remaining clusters are still
// written.
void saveClusters(char* file, int number_of_records)
{
    char  cstfile[MAX_STRING_SIZE];
    FILE* cfp;
    int   i;
    int   j;
    int   k;
    int   written;
    bool  first_field;

    for (i=0; i<number_of_records; i++)
    {
        if (cluster_sizes[i] <= 0)
        {
            continue;
        }

        // Bounded formatting: a long base name must not overrun cstfile.
        written = snprintf(cstfile, sizeof(cstfile), "%s_%d.txt", file, i);

        if ((written < 0) || (written >= (int)sizeof(cstfile)))
        {
            printf("\n");
            printf("\n  ABORTING >> Output file name for cluster %d is too long.  ", i);
            printf("\n");
            continue;
        }

        if ( ( cfp = fopen( cstfile, "w" ) ) != NULL )
        {
            for (j=0; j<number_of_records; j++)
            {
                if (allocated_cluster[j] != i)
                {
                    continue;
                }

                // The separator depends on whether something has already been
                // written on this line, not on the field index, so an unused
                // field 0 does not produce a leading comma.
                first_field = true;

                for (k=0; k<MAX_TOKENS; k++)
                {
                    if (FieldType[k] == TOKEN_UNUSED)
                    {
                        continue;
                    }

                    if (first_field)
                    {
                        fprintf(cfp, "%s", Records[j][k].text);
                        first_field = false;
                    }
                    else
                    {
                        fprintf(cfp, ",%s", Records[j][k].text);
                    }
                }
                fprintf(cfp, "\n");
            }

            fclose(cfp);
        }
        else
        {
            printf("\n");
            printf("\n  ABORTING >> Output File %s not openned.  ", cstfile);
            printf("\n");
        }
    }
}

// Scratch list of the distinct nominal values collected for one feature of
// one cluster; next_attribute is the number of entries currently in use.
char attribs[20][MAX_STRING_SIZE];
int  next_attribute;

// Adds attribute to attribs[] unless an identical string is already stored.
//
// attribute : NUL-terminated value to record.
//
// When the list is full the value is dropped instead of being written past
// the end of attribs[]; values longer than an entry are truncated.
void addAttribute(char* attribute)
{
    const int capacity = (int)(sizeof(attribs) / sizeof(attribs[0]));
    int i;

    for (i=0; i<next_attribute; i++)
    {
        if (strcmp(attribs[i], attribute) == 0)
        {
            return;     // already recorded
        }
    }

    if (next_attribute >= capacity)
    {
        return;         // list is full; writing further would overflow attribs[]
    }

    snprintf(attribs[next_attribute], sizeof(attribs[next_attribute]), "%s", attribute);
    next_attribute++;
}

// Writes a per-feature summary of every cluster with more than two members.
//
// rfp : open report stream that receives the summary.
// g   : feature mask; only features whose entry is true are reported
//       (this parameter hides the file-level g[] inside the function).
//
// Numeric features are reported as the min/max over the cluster's members;
// nominal features as the list of distinct values collected with
// addAttribute().
void analyseClusters(FILE* rfp, bool* g)
{
    for (int cluster = 0; cluster < number_of_points; ++cluster)
    {
        if (cluster_sizes[cluster] <= 2)
        {
            continue;
        }

        fprintf(rfp, "Cluster %d has %d members\n", cluster, cluster_sizes[cluster]);

        for (int feature = 0; feature < MAX_TOKENS; ++feature)
        {
            // The attribute list is emptied for every feature, reported or not.
            next_attribute = 0;

            if (!g[feature])
            {
                continue;
            }

            const int type = FieldType[feature];

            if ((type == TOKEN_LINEAR_INTEGER) || (type == TOKEN_LINEAR_FLOAT))
            {
                // Start from the opposite extremes so any member value tightens them.
                float highest = MinFieldType[feature];
                float lowest  = MaxFieldType[feature];

                for (int member = 0; member < number_of_points; ++member)
                {
                    if (allocated_cluster[member] != cluster)
                    {
                        continue;
                    }

                    const float value = Records[member][feature].value;

                    if (highest < value) highest = value;
                    if (lowest  > value) lowest  = value;
                }

                fprintf(rfp, "feature = %d, min = %f, max = %f\n\n", feature, lowest, highest);
            }
            else if (type == TOKEN_NOMINAL)
            {
                for (int member = 0; member < number_of_points; ++member)
                {
                    if (allocated_cluster[member] == cluster)
                    {
                        addAttribute(Records[member][feature].text);
                    }
                }

                fprintf(rfp, "feature = %d, ", feature);

                for (int a = 0; a < next_attribute; ++a)
                {
                    fprintf(rfp, "%s, ", attribs[a]);
                }

                fprintf(rfp, "\n\n");
            }
            // TOKEN_UNUSED and any other type: nothing to report.
        }
    }
}

// Tries to merge points that sit in very small clusters into a nearby larger
// cluster.
//
// g              : feature mask handed to distance().
// merge_distance : a point is only moved when its nearest qualifying
//                  neighbour is closer than this.
//
// A point is treated as an outlier when its cluster has at most 3 members.
// Its nearest neighbour is searched among points whose cluster has more than
// 3 members; neighbours at distance 100 or more are never considered.
// cluster_sizes[] is updated as points move, so later points see the effect
// of earlier merges.
void reduceOutliers(bool* g, int merge_distance)
{
    const int SMALL_CLUSTER_SIZE = 3;
    const int SEARCH_LIMIT       = 100;

    int i;
    int j;
    int cluster_i;
    int cluster_j;
    int nearest_cluster;
    int cluster_distance;
    int nearest_distance;

    for (i=0; i<number_of_points; i++)
    {
        cluster_i = allocated_cluster[i];

        // A point that was never assigned (-1) has no cluster size to look up.
        if (cluster_i < 0)
        {
            continue;
        }

        if (cluster_sizes[cluster_i] > SMALL_CLUSTER_SIZE)
        {
            continue;
        }

        // Can we merge it to a nearby cluster?

        nearest_distance = SEARCH_LIMIT;
        nearest_cluster  = -1;      // no candidate found yet

        for (j=0; j<number_of_points; j++)
        {
            if (i == j)
            {
                continue;
            }

            cluster_j = allocated_cluster[j];

            if ((cluster_j < 0) || (cluster_sizes[cluster_j] <= SMALL_CLUSTER_SIZE))
            {
                continue;
            }

            cluster_distance = distance(g, i, j);

            if (cluster_distance < nearest_distance)
            {
                nearest_distance = cluster_distance;
                nearest_cluster  = cluster_j;
            }
        }

        // Only merge when a candidate really exists; otherwise nearest_cluster
        // would be used without ever having been set.
        if ((nearest_cluster >= 0) && (nearest_distance < merge_distance))
        {
            allocated_cluster[i] = nearest_cluster;
            cluster_sizes[cluster_i]--;
            cluster_sizes[nearest_cluster]++;
        }
    }
}

// Prints a gene mask to stdout as "Genes = " followed by one "T " or "F " per
// feature, then a blank line.
//
// g                  : gene flags to print.
// number_of_features : how many leading entries of g are printed.
void displayGene(bool* g, int number_of_features)
{
    printf("Genes = ");

    for (int feature = 0; feature < number_of_features; ++feature)
    {
        printf("%s", g[feature] ? "T " : "F ");
    }

    printf("\n\n");
}

// Runs the complete clustering pipeline for one data set.
//
// file   : base name of the data set. "<file>.txt" is read and
//          "<file>_report.txt" is written.
// ignore : index of a field to switch off in g[]; a value outside
//          [0, MAX_TOKENS) switches nothing off.
//
// Steps, in order:
//   1. determine field types and ranges and read a sample of the data,
//   2. run the genetic search (crossover / mutation / replace) for a gene mask,
//   3. build the frequency histograms for the best mask and count its peaks
//      and troughs,
//   4. if at least one peak and one trough exist, re-read the full data,
//      cluster it, merge outliers, then report, analyse and save the clusters.
//
// Progress goes to stdout; the summary goes to the report file. If a file
// name does not fit its buffer or the report file cannot be created, a message
// is printed and the function returns without clustering.
void clusterFile(char* file, int ignore)
{
    int  number_of_records;
    int  number_of_fields;
    int  number_of_peaks;       // NOTE: hides the file-level variable of the same name
    int  number_of_troughs;     // NOTE: hides the file-level variable of the same name
    int  clustering_distance;
    int  i;
    int  best;
    int  infile_len;
    int  rptfile_len;
    char infile [MAX_STRING_SIZE];
    char rptfile[MAX_STRING_SIZE];
    FILE* rfp;

    next_cluster = -1;

    init_g();

    if ((ignore >= 0) && (ignore < MAX_TOKENS))
    {
        g[ignore] = false;
    }

    // Bounded formatting: a long base name must not overrun the buffers.
    infile_len  = snprintf(infile,  sizeof(infile),  "%s.txt", file);
    rptfile_len = snprintf(rptfile, sizeof(rptfile), "%s_report.txt", file);

    if ((infile_len  < 0) || (infile_len  >= (int)sizeof(infile)) ||
        (rptfile_len < 0) || (rptfile_len >= (int)sizeof(rptfile)))
    {
        printf("\n");
        printf("\n  ABORTING >> File name '%s' is too long.  ", file);
        printf("\n");
        return;
    }

    rfp = fopen( rptfile, "w" );

    if (rfp == NULL)
    {
        // Previously this failure was silent and the run simply did nothing.
        printf("\n");
        printf("\n  ABORTING >> Report File %s not opened.  ", rptfile);
        printf("\n");
        return;
    }

    fprintf(rfp, "Clustering '%s'\n\n", infile);

    initFieldType();
    setFieldType(infile, false);
    setMinMax(infile, false);
    total_number_of_records = readNumberOfRecords(infile, false);

    // The genetic search runs on a sample; the full data is read further down.
    number_of_records = readSampleData(infile, false);
    number_of_points  = number_of_records;
    fprintf(rfp, "%d records read\n", number_of_records);

    number_of_fields  = printFieldType();
    number_of_features = number_of_fields;
    fprintf(rfp, "Each record contains %d fields\n", number_of_fields);
    fprintf(rfp, "\n");

    initPopulation();
    for (i=0; i<50000; i++)
    {
        crossover(select(), select());
        mutation(number_of_fields);
        replace();
    }

    printf("\n");
    printf("--------------------------\n");
    printf("\n");

    best = getBest();

    printf("\n");
    printf("Best member of the population = %d\n", best);
    printf("\n");

    displayGene(population[best].genes, number_of_fields);

    printf("\n");
    printf("--------------------------\n");
    printf("\n");
    get_freq_100(population[best].genes);
    get_freq_100_ave();
    save_freq_100("freq100.csv");
    save_freq_100_ave("freq100_ave.csv");
    number_of_peaks = count_peaks();
    printf("Peaks = %d\n", number_of_peaks);
    for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
    {
        if (peaks[i] != 0) printf("%d (%d)  ", peaks[i], heights[i]);
    }
    printf("\n");
    printf("\n");
    number_of_troughs = count_troughs();
    printf("Troughs = %d\n", number_of_troughs);
    for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
    {
        if (troughs[i] != 0) printf("%d (%d)  ", troughs[i], depths[i]);
    }

    printf("\n\n");

    if ((number_of_peaks >= 1) && (number_of_troughs >= 1))
    {
        // Half the position of the first trough, but never less than 3.
        clustering_distance = troughs[0] /2;
        if (clustering_distance < 3) clustering_distance = 3;

        printf("\n");
        printf("Clustering has been found\n");
        fprintf(rfp, "Clustering has been found\n");
        printf("The clustering distance used to identify clusters = %d\n", clustering_distance);
        fprintf(rfp, "The clustering distance used to identify clusters = %d\n", clustering_distance);
        for (i=0; i<number_of_fields; i++)
        {
            if (population[best].genes[i])
            {
                printf("Clustering using field %d\n", i);
                fprintf(rfp, "Clustering using field %d\n", i);
            }
        }
        initAllocatedCluster();
        initClusterSizes();
        // Need to read in the real data!!!!
        number_of_records = readData(infile, false);
        number_of_points  = number_of_records;
        clusterData(population[best].genes, number_of_records, clustering_distance);
        setClusterSizes();
        reduceOutliers(population[best].genes, 10);
        reportClusters(rfp);
        analyseClusters(rfp, population[best].genes);
        saveClusters(file, number_of_records);
        printf("\n");
        fprintf(rfp, "\n");
    }
    else
    {
        printf("\n");
        fprintf(rfp, "\n");
        printf("Clustering has NOT been found\n");
        fprintf(rfp, "Clustering has NOT been found\n");
        printf("\n");
    }

    fclose(rfp);
}

// Console entry point: seeds the random generator with a fixed value,
// clusters one data set and waits for a key press before exiting.
//
// Only the "Mushroom" run is active. Other (base name, ignored field) pairs
// this driver has been pointed at:
//
//   "Iris", 4                      "Iris_0", 4          "Iris_3", 4
//   "Breast Cancer Wisconsin", 10
//   "Dermatology", 34              "Dermatology_0", 34  "Dermatology_5", 34
//   "Dermatology_7", 34            "Dermatology_13", 34
//   "Seeds", 7                     "Seeds_0", 7         "Seeds_2", 7
//   "Seeds_3", 7
//   "Hepatitis Domain", 0
int _tmain(int argc, _TCHAR* argv[])
{
    random.seed(9);

    clusterFile("Mushroom", 0);

    printf("\nPress any Key to exit> ");
    getchar();
    printf("\n");

    return 0;
}

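/*
    NOTE: everything inside this comment is disabled code. It appears to be an
    earlier inline draft of the clusterFile() driver and is not compiled.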

    init_g();
    //g[4] = false;

    initFieldType();
    //setFieldType("Iris.txt", false);
    //setMinMax("Iris.txt", false);
    //number_of_records = readData("Iris.txt", false);
    setFieldType("Mushroom.txt", true);
    setMinMax("Mushroom.txt", true);
    number_of_records = readData("Mushroom.txt", true);
    number_of_points  = number_of_records;
    number_of_fields  = printFieldType();

    initPopulation();
    for (i=0; i<200; i++)
    {
        crossover(select(), select());
        mutation(number_of_fields);
        replace();
    }

    printf("\n");
    printf("--------------------------\n");
    printf("\n");

    best = getBest();

    printf("\n");
    printf("Best member of the population = %d\n", best);
    printf("\n");

    printf("\n");
    printf("--------------------------\n");
    printf("\n");
    get_freq_100(population[best].genes);
    get_freq_100_ave();
    printf("Peaks=%d\n", count_peaks());
    for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
    {
        if (peaks[i] != 0) printf("%d (%d)  ", peaks[i], heights[i]);
    }
    printf("\n");
    number_of_peaks = count_peaks();
    if (2 == number_of_peaks)
    {
        printf("\n");
        //printf("Initial clustering distance = %f\n", (double)peaks[0] * max_distance() / (2.0 * 100.0));
    }
    number_of_troughs = count_troughs();
    printf("Troughs=%d\n", number_of_troughs);
    for (i=0; i<MAX_NUMBER_OF_PEAKS; i++)
    {
        if (troughs[i] != 0) printf("%d (%d)  ", troughs[i], depths[i]);
    }

    printf("\n\n");
    //save_freq_100("freq100.csv");
    //save_freq_100_ave("freq100_ave.csv");

if ((number_of_peaks >= 1) && (number_of_troughs >= 1))
{
    clustering_distance = troughs[0] / 2;
    printf("\n");
    printf("Clustering has been found\n");
    printf("The clustering distance used to identify clusters = %d\n", clustering_distance);
    for (i=0; i<number_of_fields; i++)
    {
        if (population[best].genes[i])
        {
            printf("Clustering using field %d\n", i);
        }
    }
    initAllocatedCluster();
    initClusterSizes();
    clusterData(population[best].genes, number_of_records, clustering_distance);
    reportClusters();
    printf("\n");
}
else
{
    printf("\n");
    printf("Clustering has NOT been found\n");
    printf("\n");
}
*/

Recommended Answers

All 3 Replies

Gee. Only 1900 lines of code. What kind of help are you expecting?

Code snippet posts are for sharing completed work. For language conversions, look at prior discussions about such work or, if you don't plan to do the conversion yourself, post it as a For Hire job along with the project details, deadlines, and pay.

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.