0

I discover a performance pitfall of vc2010, I compile it by release mode(O2).The results are pretty unacceptable at the first time.

#include<ctime>
#include<iostream>

/*
 * This is a small performance pitfall I discover from vc2010,
 * for simplicity, I didn't separate the header file and source file.
 * At first, I wonder that maybe the template of vc2010 has some problem,
 * so I use void* to instead template, yet the result is the same.After
 * some trials and errors, I discover the problem, let us take a look.
 */

/* Estimate the execution time of function like object
 *
 * parameter :
 * Func : function object or function pointer
 * loop : times of loop
 * name : name of the op
 *
 * return the times spent by op(seconds)
 *
 * exception : may throw any exception(depend to Func)
 */
template<typename Func>
double timeCount(Func op, size_t loop = 1, char const* name = "function")
{
  std::clock_t begin, end;
  begin = std::clock();
  for(size_t i = 0; i != loop; ++i) op();
  end = std::clock();
  double const RESULT = (end - begin) 
                        / static_cast<double>(CLOCKS_PER_SEC);
  size_t const HOUR = static_cast<size_t>(RESULT / 3600);
  size_t const MIN = static_cast<size_t>(RESULT) / 60 - HOUR * 60;
  double const SEC = RESULT - static_cast<double>(MIN * 60 + HOUR * 3600);
  std::cout<<"time of the(hours : min : sec) "
           <<name<<" : "<<HOUR<<" : "<<MIN<<" : "<<SEC;
  std::cout<<"\nloop "<<loop<<" times\n";
  return RESULT;
}

/* function similar to std::find
 *
 * parameter :
 * begin, end :  address of the Input iterators to the initial 
 *               and final positions in a sequence
 *               The range used is [begin,end), which contains 
 *               all the elements between begin 
 *               and end, including the element pointed by begin 
 *               but not the element pointed by end.
 * value      : value you want to find
 *
 * return the address of value if found, else return the address of end
 *
 * exception : strong exception guarantee
 */
void* CFind3(void* begin, void* end, int value)
{  
 while(begin != end)
  {
    if(*(int*)begin == value) break;
    begin = (char*)begin + sizeof(int);
  }
  return begin;
}

/* function similar to std::find
 *
 * parameter :
 * begin : address of the beginning
 * num : number of the elements
 * value : value you want to find
 *
 * return the address of value if found, else return the address of end
 *
 * exception : strong exception guarantee
 */
void* CFind4(void *begin, size_t num, int value)
{ 
 size_t i = 0;
 for(; i != num; ++i)
 {
  if(*(int*)begin == value) break;
  begin = (char*)(begin) + sizeof(int);
 }
 return begin;
}

/* 
 * measure the execution time of CFind3 by running 1E8 times
 */
void testCFind3()
{
 int data[] = {1, 20, 33, 55, 22, 77, 88, 99, 3000, 8880, 
               800, 777, 345, 976, 345, 222, 111, 0, 77, 
               88, 99, 1, 2, 3, 20000};
 size_t const NUM = (size_t)(1E6);
 size_t i = 0;
 for(i = 0; i != NUM; ++i) 
   void *temp = CFind3(&data[0], &data[0] + 
                       sizeof(data) / sizeof(*data), 20000); //#1
}

/* 
 * measure the execution time of CFind4 by running 1E8 times
 */
void testCFind4()
{
 int data[] = {1, 20, 33, 55, 22, 77, 88, 99, 3000, 8880, 800, 777, 
               345, 976, 345, 222, 111, 0, 77, 88, 99, 1, 2, 3, 20000};
 size_t const NUM = (size_t)(1E6);
 size_t i = 0;
 for(i = 0; i != NUM; ++i) 
   void *temp = CFind4(&data[0], sizeof(data) / 
                                 sizeof(*data), 20000); //#2
}

/* 
 * measure the execution time of CFind3 by running 1E8 times
 * this function is almost the same as testCFind3, but I use 
 * some trick to "boost up" the performance of it
 */

int main()
{  
 //compare the speed of CFind4 and CFind3
  timeCount(testCFind3, static_cast<size_t>(1E2), "testCFind3");
  timeCount(testCFind4, static_cast<size_t>(1E2), "testCFind4"); 
 
  std::cout<<"system pause"<<std::endl;
  std::cin.get();
  return 0;
}

You will find out that the speed of CFind3 is much more faster than CFind4,what is going on? The problem is CFind4 really run the loop 1E8 times but CFind3 did not. Let us try to "boost up" the speed of CFind3 by some effort.

/* 
 * measure the execution time of CFind3 by running 1E8 times
 * this function is almost the same as testCFind3, but I use 
 * some trick to "boost up" the performance of it
 */
void testCFind3_boost_up()
{
 int data[] = {1, 20, 33, 55, 22, 77, 88, 99, 3000, 8880, 800, 777, 
               345, 976, 345, 222, 111, 0, 77, 88, 99, 1, 2, 3, 20000};
 size_t const NUM = (size_t)(1E6);
 size_t i = 0;
 //make &data[0] + sizeof(data) / sizeof(*data) as const
 void *end = &data[0] + sizeof(data) / sizeof(*data); 
 for(i = 0; i != NUM; ++i) void *temp = CFind3(&data[0], end, 20000); //#3
}

Now you run the program again, you will find out that the time of testCFind3_boost_up and CFind4 are almost equal.The problem of optimization but not a bug of vc2010 and vc2008.If you like, you can check on the assembly codes at #1, #2 and #3.Since sizeof(int) / sizeof(*int) is a constant in CFind4, the compiler could optimize the codes better.

Edited by stereomatching: n/a

1
Contributor
1
Reply
2
Views
5 Years
Discussion Span
Last Post by stereomatching
0

Forgot to say, I used static_cast to cast void* to int*, I change it to
C style casting because I was seduced by the myth "C++ must slower than C".
Apparently, the problem is not static_cast or C style casting, it is just a
problem of optimization of vc2010.

If you find out any mistakes or anything could be improved,
please give me some comments.

This topic has been dead for over six months. Start a new discussion instead.
Have something to contribute to this discussion? Please be thoughtful, detailed and courteous, and be sure to adhere to our posting rules.