In the following code, I need to optimize it for the fastest performance. I'm not asking anyone to do it for me, I'm asking if someone can look through the code, and let me know where I can get the best speed up by example loop unrolling a for loop or something like that. I've gone through it, and I just can not see what I should do to improve the speed.

/********************************************************
 * Kernels to be optimized for the CS:APP Performance Lab
 ********************************************************/

#include <stdio.h>
#include <stdlib.h>
#include "defs.h"

/* 
 * Please fill in the following team struct 
 */
team_t team = {
    "flankerboy",              /* Team name */

    "",     /* First member full name */
    "",  /* First member email address */

    "",                   /* Second member full name (leave blank if none) */
    ""                    /* Second member email addr (leave blank if none) */
};

/***************
 * ROTATE KERNEL
 ***************/

/******************************************************
 * Your different versions of the rotate kernel go here
 ******************************************************/

/* 
 * naive_rotate - The naive baseline version of rotate 
 */
char naive_rotate_descr[] = "naive_rotate: Naive baseline implementation";
void naive_rotate(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; ++i)
	for (j = 0; j < dim; ++j)
	    dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

/* 
 * rotate - Your current working version of rotate
 * IMPORTANT: This is the version you will be graded on
 */
char rotate_descr[] = "rotate: Current working version";
void rotate(int dim, pixel *src, pixel *dst) 
{
    naive_rotate(dim, src, dst);
}

/*********************************************************************
 * register_rotate_functions - Register all of your different versions
 *     of the rotate kernel with the driver by calling the
 *     add_rotate_function() for each test function. When you run the
 *     driver program, it will test and report the performance of each
 *     registered test function.  
 *********************************************************************/

void register_rotate_functions() 
{
    add_rotate_function(&naive_rotate, naive_rotate_descr);   
    add_rotate_function(&rotate, rotate_descr);   
    /* ... Register additional test functions here */
}


/***************
 * SMOOTH KERNEL
 **************/

/***************************************************************
 * Various typedefs and helper functions for the smooth function
 * You may modify these any way you like.
 **************************************************************/

/* A struct used to compute averaged pixel value */
typedef struct {
    int red;
    int green;
    int blue;
    int num;
} pixel_sum;

/* Compute min and max of two integers, respectively */
static int min(int a, int b) { return (a < b ? a : b); }
static int max(int a, int b) { return (a > b ? a : b); }

/* 
 * initialize_pixel_sum - Initializes all fields of sum to 0 
 */
static void initialize_pixel_sum(pixel_sum *sum) 
{
    sum->red = sum->green = sum->blue = 0;
    sum->num = 0;
    return;
}

/* 
 * accumulate_sum - Accumulates field values of p in corresponding 
 * fields of sum 
 */
static void accumulate_sum(pixel_sum *sum, pixel p) 
{
    sum->red += (int) p.red;
    sum->green += (int) p.green;
    sum->blue += (int) p.blue;
    sum->num++;
    return;
}

/* 
 * assign_sum_to_pixel - Computes averaged pixel value in current_pixel 
 */
static void assign_sum_to_pixel(pixel *current_pixel, pixel_sum sum) 
{
  current_pixel->red = (unsigned short) (sum.red/sum.num);
  current_pixel->green = (unsigned short) (sum.green/sum.num);
  current_pixel->blue = (unsigned short) (sum.blue/sum.num);
    return;
}

/* 
 * avg - Returns averaged pixel value at (i,j) 
 */
static pixel avg(int dim, int i, int j, pixel *src) 
{
    int ii, jj;
    pixel_sum sum;
    pixel current_pixel;

    initialize_pixel_sum(&sum);
    for(ii = max(i-1, 0); ii <= min(i+1, dim-1); ++ii) 
	for(jj = max(j-1, 0); jj <= min(j+1, dim-1); ++jj) 
	    accumulate_sum(&sum, src[RIDX(ii, jj, dim)]);

    assign_sum_to_pixel(&current_pixel, sum);
    return current_pixel;
}


/******************************************************
 * Your different versions of the smooth kernel go here
 ******************************************************/

/*
 * naive_smooth - The naive baseline version of smooth 
 */
char naive_smooth_descr[] = "naive_smooth: Naive baseline implementation";
void naive_smooth(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; ++i)
	for (j = 0; j < dim; ++j)
	    dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}



/*
 * smooth - Your current working version of smooth. 
 * IMPORTANT: This is the version you will be graded on
 */
char smooth_descr[] = "smooth: Current working version";
void smooth(int dim, pixel *src, pixel *dst) 
{
    naive_smooth(dim, src, dst);
}


/********************************************************************* 
 * register_smooth_functions - Register all of your different versions
 *     of the smooth kernel with the driver by calling the
 *     add_smooth_function() for each test function.  When you run the
 *     driver program, it will test and report the performance of each
 *     registered test function.  
 *********************************************************************/

void register_smooth_functions() {
    add_smooth_function(&smooth, smooth_descr);
    add_smooth_function(&naive_smooth, naive_smooth_descr);
    /* ... Register additional test functions here */
}

Whenever I want to optimize a program, I usually start by trying to map the entire problem mathematically, simplify it, then re-code the simplified formula. I don't have time right now to look at your code, but maybe that bit of advice might help you.

The first step in optimizing is to ask yourself, "do I really need to optimize this?" I see "Kernels to be optimized for the CS:APP Performance Lab" in your comments, so I'm assuming the answer here is "yes." :)

In the wild, though, the answer is just as likely to be "no." I prefer to see people write inefficient code that is easy to read and understand rather than a clever, efficient algorithm that is inaccessible to the average coder. The answer becomes "yes" when you can demonstrate a clear, quantifiable need for efficiency.

The second step is to ask yourself, "what should I be optimizing?" The code you posted is short enough that you might answer that question analytically, i.e., read the code and try to find parts that look like they might be slow. If this exercise is supposed to be analytical, then I recommend you look very closely at naive_rotate and naive_smooth --the word "naive" is usually an obvious marker of inefficiency. Do as N1GHTS suggests and rewrite those functions in plain English to understand what they're actually doing. I suspect that the code inside the inner loops can be simplified, and you may be able to shorten or even eliminate the inner loops themselves.

In the field, you'll want to measure an inefficient program to see where the performance-critical areas are. The last thing you want is to spend a lot of time making a particular function run blindingly fast only to discover that your application is spending most of its time doing something else. The real-world answer to the second question is given by running a profiler to get some concrete evidence of actual performance in your hands. While your exercise is simple enough that it doesn't really require that level of detail, you still may find it useful to run one, to measure the performance of your new code against the original, naive functions.

I prefer to see people write inefficient code that is easy to read and understand rather than a clever, efficient algorithm that is inaccessible to the average coder

Comments are a requirement for any well written program -- complex code is valid as long as equal emphasis is given to its corresponding comments. For a very complex line of code, I like to write mini essays on the concept behind the algorithm in a comment block. I don't believe in "easy to read code" because code is not human in the first place. There is no excuse to write bad code, and there is no excuse to write bad comments.

I personally have a 60/40 ratio to all my unique code, where 60% is plain english and 40% is code. I don't care how redundant it is to write /* increment the counter */ as long as there is a consistent flow of explanation in the code. Additionally, my code in assembly language has about an 80/20 ratio of comments to code. Given all that effort, those who work with my code find it very easy to work with despite its highly optimized nature.

Now, this is my own opinion. Your statement is technically correct and is probably worth money.

Comments are a requirement for any well written program -- complex code is valid as long as equal emphasis is given to its corresponding comments. ... There is no excuse to write bad code, and there is no excuse to write bad comments.

I don't think we disagree; that sounds sane to me. I was more concerned with emphasizing the evils of premature optimization. Complex code is fine; it's just that in practice, you'll have a limited amount of time to spend on any given project, so save it for when it really matters. Also, there are enough people working in software that don't truly understand what they're doing (not referring to anyone here, just a comment on some of the 'professionals' I've encountered), so all else being equal, simpler code is more idiot-proof.

How do you run a profiler?

Depends on which one you use--profilers work in a variety of different ways. Search the Internet for "C profiler" or something similar and have a look at what's available.

If you use an IDE, it may provide some integration already--for example, Code::Blocks has a plugin that runs gprof.

I thank yall for your help, I have spent the last 3 days rethinking the code, and I came up with a way to speed it up a lot. If you like to see the final product of what I have Id be more than welcome to share.

Hello Charlton21;
I'm working on the same assignment and read the advice you got and seems like a good idea. How well did you code optimize?
Edward (moonbow)

This article has been dead for over six months. Start a new discussion instead.