#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>


#define TRAINING_LIST    "./rc/training.list"
#define TRAINING_FILE   "./training_set/%s"
#define TEST_PATH       "./test_set/%s"
#define PREDICTION_FILE "./prediction.txt"
#define FEATURE_FILE "./feat/features.txt"

#define MAX_RATINGS     100480508     // Ratings in entire training set (+1)
#define MAX_CUSTOMERS   480190        // Customers in the entire training set (+1)
#define MAX_MOVIES      17771         // Movies in the entire training set (+1)
#define MAX_FEATURES    64            // Number of features to use 
#define MIN_EPOCHS      120           // Minimum number of epochs per feature
#define MAX_EPOCHS      200           // Max epochs per feature

#define MIN_IMPROVEMENT 0.0001        // Minimum improvement required to continue current feature
#define INIT            0.1           // Initialization value for features
#define LRATE           0.001         // Learning rate parameter
#define K               0.015         // Regularization parameter used to minimize over-fitting
#define HASH_SIZE 54321
typedef unsigned char BYTE;
struct Movie
{
    int         RatingCount;
    int         RatingSum;
    double      RatingAvg;            
    double      PseudoAvg;            // Weighted average used to deal with small movie counts 
};

struct Customer
{
    int         CustomerId;
    int         RatingCount;
    int         RatingSum;
};

struct Data
{
    int         CustId;
    short       MovieId;
    char        Rating;
    float       Cache;
};

struct Hash
{
   int CustId;
   int RunningId;
};

class Engine 
{
private:
    int             m_nRatingCount;                                 // Current number of loaded ratings
    Data            m_aRatings[MAX_RATINGS];                        // Array of ratings data
    Movie           m_aMovies[MAX_MOVIES];                          // Array of movie metrics
    Customer        m_aCustomers[MAX_CUSTOMERS];                    // Array of customer metrics
    float           m_aMovieFeatures[MAX_FEATURES][MAX_MOVIES];     // Array of features by movie (using floats to save space)
    float           m_aCustFeatures[MAX_FEATURES][MAX_CUSTOMERS];   // Array of features by customer (using floats to save space)
    int             m_nCustomers;
    Hash*           m_aCustomerHash[HASH_SIZE];
    int             m_nCustomerHashSize[HASH_SIZE];
    int             m_nCustomerHashPos[HASH_SIZE];

    inline double   PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing=true);
    inline double   PredictRating(short movieId, int custId);

    bool            ReadNumber(char* pwzBufferIn, int nLength, int &nPosition, char* pwzBufferOut);
    bool            ParseInt(char* pwzBuffer, int nLength, int &nPosition, int& nValue);
    bool            ParseFloat(char* pwzBuffer, int nLength, int &nPosition, float& fValue);
    int             GetCustomerHashId( int CustId );
    int             GetCustomerId( int CustId );

public:
    Engine(void);
    ~Engine(void) { };

    void            CalcMetrics();
    void            CalcFeatures();
    void            SaveFeatures();
    void            LoadHistory();
    void            ProcessTest(char* pwzFile);
    void            ProcessFile(char* pwzFile);
};


//===================================================================
//
// Program Main
//
//===================================================================
int main(int argc, char* argv[])
{
    Engine* engine = new Engine();

    engine->LoadHistory();
    engine->CalcMetrics();
    engine->CalcFeatures();
    engine->SaveFeatures();
	engine->ProcessTest("qualifying.txt");

    printf("\nDone\n");
    getchar();
    
    return 0;
}
//===================================================================
//
// Engine Class 
//
//===================================================================

//-------------------------------------------------------------------
// Initialization
//-------------------------------------------------------------------

Engine::Engine(void)
{
    m_nRatingCount = 0;

    for (int f=0; f<MAX_FEATURES; f++)
    {
        for (int i=0; i<MAX_MOVIES; i++) m_aMovieFeatures[f][i] = (float)INIT;
        for (int i=0; i<MAX_CUSTOMERS; i++) m_aCustFeatures[f][i] = (float)INIT;
    }

    m_nCustomers = 0;

    for (int f=0; f<HASH_SIZE; f++) {
        m_aCustomerHash[f] = (Hash*)malloc( 10 * sizeof(Hash) );
        m_nCustomerHashPos[f] = 0;
        m_nCustomerHashSize[f] = 10;
    }
}

//-------------------------------------------------------------------
// Calculations - This section contains all of the relevant code
//-------------------------------------------------------------------

//
// CalcMetrics
// - Loop through the history and pre-calculate metrics used in the training 
// - Also re-number the customer id's to fit in a fixed array
//
void Engine::CalcMetrics()
{
    int i, cid, k, index;
    Hash* htemp;
    /* IdItr itr; */

    printf("\nCalculating intermediate metrics\n\n");

    // Process each row in the training set
    for (i=0; i<m_nRatingCount; i++)
    {
        if ((i % 1000000)==0) printf("."), fflush(stdout);

        Data* rating = m_aRatings + i;

        // Increment movie stats
        m_aMovies[rating->MovieId].RatingCount++;
        m_aMovies[rating->MovieId].RatingSum += rating->Rating;
        
        // Add customers (using a map to re-number id's to array indexes) 
        cid = GetCustomerId( rating->CustId ); 
        if ( cid<0 )
        {
            /* find pos to insert CustId */
            index = GetCustomerHashId(rating->CustId);

            if (m_nCustomerHashSize[index] <= m_nCustomerHashPos[index]) {
               m_nCustomerHashSize[index] += 10;
               htemp = (Hash*)malloc( m_nCustomerHashSize[index] * sizeof(Hash) );
               for(k=0;k<m_nCustomerHashPos[index];k++) {
                  htemp[k].CustId = m_aCustomerHash[index][k].CustId;
                  htemp[k].RunningId = m_aCustomerHash[index][k].RunningId;
               }
               free(m_aCustomerHash[index]);
               m_aCustomerHash[index] = htemp;
            }

            cid = m_nCustomers++;

            m_aCustomerHash[index][m_nCustomerHashPos[index]].CustId = rating->CustId;
            m_aCustomerHash[index][m_nCustomerHashPos[index]].RunningId = cid;
            m_nCustomerHashPos[index]++;

            if (m_nCustomers > MAX_CUSTOMERS) {
               printf("too many customers!\n");
               return;
            }

            // Store off old sparse id for later
            m_aCustomers[cid].CustomerId = rating->CustId;

            // Init vars to zero
            m_aCustomers[cid].RatingCount = 0;
            m_aCustomers[cid].RatingSum = 0;
        }

        // Swap sparse id for compact one
        rating->CustId = cid;

        m_aCustomers[cid].RatingCount++;
        m_aCustomers[cid].RatingSum += rating->Rating;
    }

    // Do a follow-up loop to calc movie averages
    for (i=0; i<MAX_MOVIES; i++)
    {
        Movie* movie = m_aMovies+i;
        movie->RatingAvg = movie->RatingSum / (1.0 * movie->RatingCount);
        movie->PseudoAvg = (3.23 * 25 + movie->RatingSum) / (25.0 + movie->RatingCount);
    }
    printf("\n");
}

//
// SaveFeatures
//
void Engine::SaveFeatures()
{
    FILE *f;
    int i,j;

    f = fopen(FEATURE_FILE,"w");

    fprintf(f,"Featurefile:Version1.0\n");

    for(i=0;i<MAX_FEATURES;i++) {
       fprintf(f,"\nMOVIE[%d]\n",i);
       for(j=0;j<MAX_MOVIES;j++)
          fprintf(f," %g",m_aMovieFeatures[i][j]);
       fprintf(f,"\nCUSTOMER[%d]\n",i);
       for(j=0;j<MAX_CUSTOMERS;j++)
          fprintf(f," %g",m_aCustFeatures[i][j]);
    }
    fprintf(f,"\n");

    fclose(f);
}

//
// CalcFeatures
// - Iteratively train each feature on the entire data set
// - Once sufficient progress has been made, move on
//
void Engine::CalcFeatures()
{
    int f, e, i, custId, cnt = 0;
    Data* rating;
    double err, p, sq, rmse_last, rmse = 2.0;
    short movieId;
    float cf, mf;

    for (f=0; f<MAX_FEATURES; f++)
    {
        printf("\n--- Calculating feature: %d ---\n", f);

        // Keep looping until you have passed a minimum number 
        // of epochs or have stopped making significant progress 
        for (e=0; (e < MIN_EPOCHS) || (rmse <= rmse_last - MIN_IMPROVEMENT); e++)
        {
            cnt++;
            sq = 0;
            rmse_last = rmse;

            for (i=0; i<m_nRatingCount; i++)
            {
                rating = m_aRatings + i;
                movieId = rating->MovieId;
                custId = rating->CustId;

                // Predict rating and calc error
                p = PredictRating(movieId, custId, f, rating->Cache, true);
                err = (1.0 * rating->Rating - p);
                sq += err*err;
                
                // Cache off old feature values
                cf = m_aCustFeatures[f][custId];
                mf = m_aMovieFeatures[f][movieId];

                // Cross-train the features
                m_aCustFeatures[f][custId] += (float)(LRATE * (err * mf - K * cf));
                m_aMovieFeatures[f][movieId] += (float)(LRATE * (err * cf - K * mf));
            }
            
            rmse = sqrt(sq/m_nRatingCount);
                  
            printf("     <set x='%d' y='%f' />\n",cnt,rmse);
        }

        // Cache off old predictions
        for (i=0; i<m_nRatingCount; i++)
        {
            rating = m_aRatings + i;
            rating->Cache = (float)PredictRating(rating->MovieId, rating->CustId, f, rating->Cache, false);
        }            
    }
}

//
// PredictRating
// - During training there is no need to loop through all of the features
// - Use a cache for the leading features and do a quick calculation for the trailing
// - The trailing can be optionally removed when calculating a new cache value
//
double Engine::PredictRating(short movieId, int custId, int feature, float cache, bool bTrailing)
{
    // Get cached value for old features or default to an average
    double sum = (cache > 0) ? cache : 1; //m_aMovies[movieId].PseudoAvg; 

    // Add contribution of current feature
    sum += m_aMovieFeatures[feature][movieId] * m_aCustFeatures[feature][custId];
    if (sum > 5) sum = 5;
    if (sum < 1) sum = 1;

    // Add up trailing defaults values
    if (bTrailing)
    {
        sum += (MAX_FEATURES-feature-1) * (INIT * INIT);
        if (sum > 5) sum = 5;
        if (sum < 1) sum = 1;
    }

    return sum;
}

//
// PredictRating
// - This version is used for calculating the final results
// - It loops through the entire list of finished features
//
double Engine::PredictRating(short movieId, int custId)
{
    double sum = 1; //m_aMovies[movieId].PseudoAvg;

    for (int f=0; f<MAX_FEATURES; f++) 
    {
        sum += m_aMovieFeatures[f][movieId] * m_aCustFeatures[f][custId];
        if (sum > 5) sum = 5;
        if (sum < 1) sum = 1;
    }

    return sum;
}

//-------------------------------------------------------------------
// Data Loading / Saving
//-------------------------------------------------------------------

//
// LoadHistory
// - Loop through all of the files in the training directory
//
void Engine::LoadHistory()
{
    FILE *lfile;
    bool bContinue = true;
    int read,count = 0; // TEST
    char buf[50];

    lfile = fopen(TRAINING_LIST,"r");
    read = fscanf(lfile,"%s\n",&buf);
    
    while (read>0) 
    {
        this->ProcessFile(buf);
        read = fscanf(lfile,"%s\n",&buf);

        // if (++count > 999) break; // TEST: Uncomment to only test with the first X movies
    } 

    /* FindClose(hFind); */
    fclose(lfile);
}

//
// ProcessFile
// - Load a history file in the format:
//
//   <MovieId>:
//   <CustomerId>,<Rating>
//   <CustomerId>,<Rating>
//   ...
void Engine::ProcessFile(char* pwzFile)
{
    FILE *stream;
    char pwzBuffer[1000];
    sprintf(pwzBuffer,TRAINING_FILE,pwzFile);
    int custId, movieId, rating, pos = 0;
    int read;
    
    printf("Processing file: %s\n", pwzBuffer);

    stream = fopen(pwzBuffer, "r");
    if (!stream) return;

    // First line is the movie id
    fscanf(stream,"%s\n",&pwzBuffer);
    ParseInt(pwzBuffer, (int)strlen(pwzBuffer), pos, movieId);
    m_aMovies[movieId].RatingCount = 0;
    m_aMovies[movieId].RatingSum = 0;

    // Get all remaining rows
    read = fscanf(stream,"%s\n",&pwzBuffer);
    while ( read>0 )
    {
        pos = 0;
        ParseInt(pwzBuffer, (int)strlen(pwzBuffer), pos, custId);
        ParseInt(pwzBuffer, (int)strlen(pwzBuffer), pos, rating);
        
        m_aRatings[m_nRatingCount].MovieId = (short)movieId;
        m_aRatings[m_nRatingCount].CustId = custId;
        m_aRatings[m_nRatingCount].Rating = (BYTE)rating;
        m_aRatings[m_nRatingCount].Cache = 0;
        m_nRatingCount++;

        read = fscanf(stream,"%s\n",&pwzBuffer);
    }

    // Cleanup
    fclose( stream );
}

//
// ProcessTest
// - Load a sample set in the following format
//
//   <Movie1Id>:
//   <CustomerId>
//   <CustomerId>
//   ...
//   <Movie2Id>:
//   <CustomerId>
//
// - And write results:
//
//   <Movie1Id>:
//   <Rating>
//   <Raing>
//   ...
void Engine::ProcessTest(char* pwzFile)
{
    FILE *streamIn, *streamOut;
    char pwzBuffer[1000];
    int custId, movieId, pos = 0;
    double rating;
    bool bMovieRow;
    int read;

    sprintf(pwzBuffer, TEST_PATH, pwzFile);
    printf("\n\nProcessing test: %s\n", pwzBuffer);

    streamIn = fopen(pwzBuffer, "r");
    if (!streamIn) return;
    streamOut = fopen(PREDICTION_FILE, "w");
    if (!streamOut) return;

    read = fscanf(streamIn,"%s\n",&pwzBuffer);
    while ( read>0 )
    {
        bMovieRow = false;
        for (int i=0; i<(int)strlen(pwzBuffer); i++)
        {
            bMovieRow |= (pwzBuffer[i] == 58); 
        }

        pos = 0;
        if (bMovieRow)
        {
            ParseInt(pwzBuffer, (int)strlen(pwzBuffer), pos, movieId);

            // Write same row to results
            fprintf(streamOut,"%s\n",&pwzBuffer);
        }
        else
        {
            ParseInt(pwzBuffer, (int)strlen(pwzBuffer), pos, custId);
            custId = GetCustomerId(custId);
            rating = PredictRating(movieId, custId);

            // Write predicted value
            sprintf(pwzBuffer,"%5.3f\n",rating);
            fprintf(streamOut,"%s",&pwzBuffer);
        }

        //wprintf(L"Got Line: %d %d %d \n", movieId, custId, rating);
        read = fscanf(streamIn,"%s\n",&pwzBuffer);
    }

    // Cleanup
    fclose( streamIn );
    fclose( streamOut );
}

//-------------------------------------------------------------------
// Helper Functions
//-------------------------------------------------------------------
bool Engine::ReadNumber(char* pwzBufferIn, int nLength, int &nPosition, char* pwzBufferOut)
{
    int count = 0;
    int start = nPosition;
    char wc = 0;

    // Find start of number
    while (start < nLength)
    {
        wc = pwzBufferIn[start];
        if ((wc >= 48 && wc <= 57) || (wc == 45)) break;
        start++;
    }

    // Copy each character into the output buffer
    nPosition = start;
    while (nPosition < nLength && ((wc >= 48 && wc <= 57) || wc == 69  || wc == 101 || wc == 45 || wc == 46))
    {
        pwzBufferOut[count++] = wc;
        wc = pwzBufferIn[++nPosition];
    }

    // Null terminate and return
    pwzBufferOut[count] = 0;
    return (count > 0);
}

bool Engine::ParseFloat(char* pwzBuffer, int nLength, int &nPosition, float& fValue)
{
    char pwzNumber[20];
    bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber);
    fValue = (bResult) ? (float)atof(pwzNumber) : 0;
    return false;
}

bool Engine::ParseInt(char* pwzBuffer, int nLength, int &nPosition, int& nValue)
{
    char pwzNumber[20];
    bool bResult = ReadNumber(pwzBuffer, nLength, nPosition, pwzNumber);
    nValue = (bResult) ? atoi(pwzNumber) : 0;
    return bResult;
}

int Engine::GetCustomerHashId( int CustId )
{
   return CustId % HASH_SIZE;
}

int Engine::GetCustomerId(int CustId)
{
    static int i,index;

    index = GetCustomerHashId( CustId );

    for(i=0;i<m_nCustomerHashPos[index];i++)
       if (m_aCustomerHash[index][i].CustId == CustId) return m_aCustomerHash[index][i].RunningId;

    return -1;
}

i am using visual C++ express edition to run this code....it compiles without any error but when i debug it, it gives me the following msg...there is a screen shot of tht msg...pls suggest me some ways to solve it and help me debug it

Attachments Untitled.png 135.87 KB

Find out where you are calling fscanf() then check that the FILE stream has been opened successfully (check if it's not NULL). You probably are not checking if fopen() succeeded.

Find out where you are calling fscanf() then check that the FILE stream has been opened successfully (check if it's not NULL). You probably are not checking if fopen() succeeded.

if u do not mind can u pls edit it there in my code...it gives me the error - (stream!=NULL) ... and do u think that i will be able to run it with visual C++ 2008 express edition or will i have to use some other compiler...

Well, I'm not about to fix the bug for you. You can do it, so you fix it. If the program compiles ok with VC++ express then you should be able to run it ok too. The error message you posted would happen regardless of the compiler. You just have to figure out what caused it.

Well, I'm not about to fix the bug for you. You can do it, so you fix it. If the program compiles ok with VC++ express then you should be able to run it ok too. The error message you posted would happen regardless of the compiler. You just have to figure out what caused it.

hmm ok i will try ot do it....if ever u can help me out wid d bug then pls do...anyways thanks for the help...

void Engine::LoadHistory()
{
    FILE *lfile;
    bool bContinue = true;
    int read,count = 0; // TEST
    char buf[50];

    lfile = fopen(TRAINING_LIST,"r");
    read = fscanf(lfile,"%s\n",&buf);

The problem is here. Check if the file was opened and quit if not.

void Engine::LoadHistory()
{
    FILE *lfile;
    bool bContinue = true;
    int read,count = 0; // TEST
    char buf[50];

    lfile = fopen(TRAINING_LIST,"r");
    read = fscanf(lfile,"%s\n",&buf);

The problem is here. Check if the file was opened and quit if not.

hey thanks man...i did edit it as follows -

void Engine::LoadHistory()
{
    FILE *lfile;
    bool bContinue = true;
    int read,count = 0; // TEST
    char buf[50];

    lfile = fopen(TRAINING_LIST,"r");
    if (!lfile) return;
    read = fscanf(lfile,"%s\n",&buf);

it does and then gives me other error..i have attached the screen shot for that...

Attachments Untitled2.png 173.16 KB

Apparently there are other similar errors. Find and fix them. Hunt and destroy!

ohk i will hunt for it....but when u have spare time can u please help me out...thank u so much...

This article has been dead for over six months. Start a new discussion instead.