Commit c1487e2d authored by maitre

TGP on GPU working version

parent f4256389
......@@ -733,6 +733,9 @@ if(OPERATING_SYSTEM=WINDOWS)
if( TARGET==CUDA )
strcat(sFileName,"Individual.cu");
else if( TARGET==STD )
if( TARGET_FLAVOR==CUDA_FLAVOR_GP)
strcat(sFileName,"Individual.cu");
else
strcat(sFileName,"Individual.cpp");
fpOutputFile=fopen(sFileName,"w");
if (bVERBOSE) printf("Creating %s...\n",sFileName);
......
50
0.680375,0.133533
-0.211234,0.040727
0.566198,0.147983
0.596880,0.147634
0.823295,0.070360
-0.604897,0.147122
-0.329554,0.086297
0.536459,0.145979
-0.444451,0.127203
0.107940,0.011381
-0.045206,0.002035
0.257742,0.057898
-0.270431,0.062827
0.026802,0.000717
0.904459,0.027083
0.832390,0.065356
0.271423,0.063216
0.434594,0.124264
-0.716795,0.121459
0.213938,0.041676
-0.967399,0.003850
-0.514226,0.143073
-0.725537,0.118069
0.608353,0.146846
-0.686642,0.131701
-0.198111,0.036228
-0.740419,0.111894
-0.782382,0.092093
0.997849,0.000018
-0.563486,0.147894
0.025865,0.000668
0.678224,0.134138
0.225280,0.045730
-0.407937,0.115635
0.275105,0.064660
0.048574,0.002348
-0.012834,0.000165
0.945550,0.010033
-0.414966,0.117999
0.542715,0.146585
0.053490,0.002845
0.539828,0.146317
-0.199543,0.036710
0.783059,0.091750
-0.433371,0.123889
-0.295083,0.072571
0.615449,0.146177
0.838053,0.062231
-0.860489,0.049884
0.898654,0.029901
This diff is collapsed.
// Number of threads per block the kernel below is written for.
#define NUMTHREAD2 128
// Capacity of the per-thread evaluation stack.
#define MAX_STACK 50
// log2(NUMTHREAD2); used to divide by the block width with a shift.
#define LOGNUMTHREAD2 7
// An absolute error at or below this value counts as a hit.
#define HIT_LEVEL 0.01f
// Absolute errors below this value are flushed to zero.
#define PROBABLY_ZERO 1.11E-15f
// Upper cap applied to the absolute error (also replaces NaN/inf errors).
#define BIG_NUMBER 1.0E15f

/**
 * Evaluates postfix-encoded programs against a training set.
 *
 * Launch layout: one block per program (the program number is blockIdx.x).
 * The kernel assumes blockDim.x == NUMTHREAD2: the shared scratch arrays and
 * the final reduction both span exactly NUMTHREAD2 slots.
 *
 * Each thread interprets the block's program on the training cases
 * tid, tid + NUMTHREAD2, tid + 2*NUMTHREAD2, ... and accumulates the absolute
 * error and the number of hits; thread 0 then sums the per-thread partials.
 *
 * @param k_progs         flat buffer holding opcodes (stored as floats) and
 *                        inline constants; k_progs[index] == -1 makes the
 *                        block skip program `index` ("already evaluated")
 * @param maxprogssize    not used by this kernel
 * @param popsize         number of programs; blocks with index >= popsize exit
 * @param k_inputs        one input value per training case
 * @param k_outputs       one expected output per training case
 * @param trainingSetSize number of training cases
 * @param k_results       out: summed absolute error, one entry per program
 * @param k_hits          out: hit count, one entry per program
 * @param k_indexes       offset in k_progs of each program's first opcode
 *
 * NOTE(review): the evaluation stack is not bounds-checked; programs are
 * assumed to be well-formed postfix needing at most MAX_STACK slots.
 */
__global__ static void
EvaluatePostFixIndividuals_128(const float * k_progs,
                               const int maxprogssize,
                               const int popsize,
                               const float * k_inputs,
                               const float * k_outputs,
                               const int trainingSetSize,
                               float * k_results,
                               int *k_hits,
                               int* k_indexes
                               )
{
    __shared__ float tmpresult[NUMTHREAD2];
    // Hit counts are integers: keep them in an int array so the block total
    // stays exact (a float accumulator loses exactness above 2^24).
    __shared__ int tmphits[NUMTHREAD2];

    const int tid = threadIdx.x;  // 0 to NUMTHREAD2-1
    const int bid = blockIdx.x;   // 0 to NUM_BLOCKS-1

    const int index = bid;        // one program per block => block ID = program number

    // Both early exits depend only on the block index, so either every thread
    // of the block leaves here or none does; the barrier below stays safe.
    if (index >= popsize)          // idle block (should never occur)
        return;
    if (k_progs[index] == -1.0f)   // already evaluated
        return;

    float sum = 0.0f;  // raw error accumulated by this thread
    int hits = 0;      // hits counted by this thread

    float stack[MAX_STACK];
    float op1, op2;

    // Loop on training cases, per cluster of NUMTHREAD2 cases (= number of
    // threads). The trip count is ceil(trainingSetSize / NUMTHREAD2) so the
    // last, possibly partial, cluster is covered as well.
    for (int i = 0; i < ((trainingSetSize-1)>>LOGNUMTHREAD2)+1; i++) {
        const int caseIdx = i*NUMTHREAD2 + tid;
        // are we on a busy thread?
        if (caseIdx >= trainingSetSize)  // no!
            continue;

        const float currentX = k_inputs[caseIdx];
        const float currentOutput = k_outputs[caseIdx];

        int start_prog = k_indexes[index];  // index of first codop
        int codop = k_progs[start_prog++];
        int sp = 0;                         // stack pointer (next free slot)

        while (codop != OP_RETURN) {
            switch (codop)
            {
            case OP_W :
                stack[sp++] = currentX;
                break;
            case OP_ERC:
                // the constant is stored inline, right after the opcode
                stack[sp++] = k_progs[start_prog++];
                break;
            case OP_MUL :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op1*op2;
                sp++;
                break;
            case OP_ADD :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op1+op2;
                sp++;
                break;
            case OP_SUB :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op2 - op1;
                sp++;
                break;
            case OP_DIV :
                sp--;
                op2 = stack[sp];
                sp--;
                op1 = stack[sp];
                // protected division: x / 0 yields 1
                if (op2 == 0.0f)
                    stack[sp] = 1.0f;
                else
                    stack[sp] = op1/op2;
                sp++;
                break;
            }
            // get next codop
            codop = k_progs[start_prog++];
        } // codop interpret loop

        float result = fabsf(stack[0] - currentOutput);
        // "!(x < BIG)" rather than "x >= BIG" so that NaN is capped as well
        if (!(result < BIG_NUMBER))
            result = BIG_NUMBER;
        else if (result < PROBABLY_ZERO)
            result = 0.0f;

        if (result <= HIT_LEVEL)
            hits++;
        sum += result;  // sum raw error on all training cases
    } // LOOP ON TRAINING CASES

    // gather results from all threads => we need to synchronize
    tmpresult[tid] = sum;
    tmphits[tid] = hits;
    __syncthreads();

    if (tid == 0) {
        for (int i = 1; i < NUMTHREAD2; i++) {
            tmpresult[0] += tmpresult[i];
            tmphits[0] += tmphits[i];
        }
        k_results[index] = tmpresult[0];
        k_hits[index] = tmphits[0];
    }
    // here results and hits have been stored in their respective array: we can leave
}
// Number of threads per block the kernel below is written for.
#define NUMTHREAD2 128
// Capacity of the per-thread evaluation stack.
#define MAX_STACK 50
// log2(NUMTHREAD2); used to divide by the block width with a shift.
#define LOGNUMTHREAD2 7
// An absolute error at or below this value counts as a hit.
#define HIT_LEVEL 0.01f
// Absolute errors below this value are flushed to zero.
#define PROBABLY_ZERO 1.11E-15f
// Upper cap applied to the absolute error (also replaces NaN/inf errors).
#define BIG_NUMBER 1.0E15f

/**
 * Evaluates postfix-encoded programs against a training set.
 *
 * Launch layout: one block per program (the program number is blockIdx.x).
 * The kernel assumes blockDim.x == NUMTHREAD2: the shared scratch arrays and
 * the final reduction both span exactly NUMTHREAD2 slots.
 *
 * Each thread interprets the block's program on the training cases
 * tid, tid + NUMTHREAD2, tid + 2*NUMTHREAD2, ... and accumulates the absolute
 * error and the number of hits; thread 0 then sums the per-thread partials.
 *
 * The unary opcodes OP_SIN, OP_COS, OP_EXP and OP_LOG are compiled in only
 * when the corresponding macro is defined. When FAST_MATH is non-zero the
 * approximate intrinsics (__fdividef, __sinf, __cosf, __expf, __logf) are
 * used instead of the regular single-precision functions.
 *
 * @param k_progs         flat buffer holding opcodes (stored as floats) and
 *                        inline constants; k_progs[index] == -1 makes the
 *                        block skip program `index` ("already evaluated")
 * @param maxprogssize    not used by this kernel
 * @param popsize         number of programs; blocks with index >= popsize exit
 * @param k_inputs        one input value per training case
 * @param k_outputs       one expected output per training case
 * @param trainingSetSize number of training cases
 * @param k_results       out: summed absolute error, one entry per program
 * @param k_hits          out: hit count, one entry per program
 * @param k_indexes       offset in k_progs of each program's first opcode
 *
 * NOTE(review): the evaluation stack is not bounds-checked; programs are
 * assumed to be well-formed postfix needing at most MAX_STACK slots.
 */
__global__ static void
EvaluatePostFixIndividuals_128(const float * k_progs,
                               const int maxprogssize,
                               const int popsize,
                               const float * k_inputs,
                               const float * k_outputs,
                               const int trainingSetSize,
                               float * k_results,
                               int *k_hits,
                               int* k_indexes
                               )
{
    __shared__ float tmpresult[NUMTHREAD2];
    // Hit counts are integers: keep them in an int array so the block total
    // stays exact (a float accumulator loses exactness above 2^24).
    __shared__ int tmphits[NUMTHREAD2];

    const int tid = threadIdx.x;  // 0 to NUMTHREAD2-1
    const int bid = blockIdx.x;   // 0 to NUM_BLOCKS-1

    const int index = bid;        // one program per block => block ID = program number

    // Both early exits depend only on the block index, so either every thread
    // of the block leaves here or none does; the barrier below stays safe.
    if (index >= popsize)          // idle block (should never occur)
        return;
    if (k_progs[index] == -1.0f)   // already evaluated
        return;

    float sum = 0.0f;  // raw error accumulated by this thread
    int hits = 0;      // hits counted by this thread

    float stack[MAX_STACK];
    float op1, op2;

    // Loop on training cases, per cluster of NUMTHREAD2 cases (= number of
    // threads). The trip count is ceil(trainingSetSize / NUMTHREAD2) so the
    // last, possibly partial, cluster is covered as well.
    for (int i = 0; i < ((trainingSetSize-1)>>LOGNUMTHREAD2)+1; i++) {
        const int caseIdx = i*NUMTHREAD2 + tid;
        // are we on a busy thread?
        if (caseIdx >= trainingSetSize)  // no!
            continue;

        const float currentX = k_inputs[caseIdx];
        const float currentOutput = k_outputs[caseIdx];

        int start_prog = k_indexes[index];  // index of first codop
        int codop = k_progs[start_prog++];
        int sp = 0;                         // stack pointer (next free slot)

        while (codop != OP_RETURN) {
            switch (codop)
            {
            case OP_W :
                stack[sp++] = currentX;
                break;
            case OP_ERC:
                // the constant is stored inline, right after the opcode
                stack[sp++] = k_progs[start_prog++];
                break;
            case OP_MUL :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op1*op2;
                sp++;
                break;
            case OP_ADD :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op1+op2;
                sp++;
                break;
            case OP_SUB :
                sp--;
                op1 = stack[sp];
                sp--;
                op2 = stack[sp];
                stack[sp] = op2 - op1;
                sp++;
                break;
            case OP_DIV :
                sp--;
                op2 = stack[sp];
                sp--;
                op1 = stack[sp];
                // protected division: x / 0 yields 1
                if (op2 == 0.0f)
                    stack[sp] = 1.0f;
                else
#if FAST_MATH
                    stack[sp] = __fdividef(op1, op2);
#else
                    stack[sp] = op1/op2;
#endif
                sp++;
                break;
#ifdef OP_SIN
            case OP_SIN :
                // unary operators rewrite the top of the stack in place
#if FAST_MATH
                stack[sp-1] = __sinf(stack[sp-1]);
#else
                stack[sp-1] = sinf(stack[sp-1]);
#endif
                break;
#endif
#ifdef OP_COS
            case OP_COS :
#if FAST_MATH
                stack[sp-1] = __cosf(stack[sp-1]);
#else
                stack[sp-1] = cosf(stack[sp-1]);
#endif
                break;
#endif
#ifdef OP_EXP
            case OP_EXP :
#if FAST_MATH
                stack[sp-1] = __expf(stack[sp-1]);
#else
                stack[sp-1] = expf(stack[sp-1]);
#endif
                break;
#endif
#ifdef OP_LOG
            case OP_LOG :
                // protected logarithm: log(0) yields 0, otherwise log(|x|)
                op1 = stack[sp-1];
                if (op1 == 0.0f)
                    stack[sp-1] = 0.0f;
                else
#if FAST_MATH
                    stack[sp-1] = __logf(fabsf(op1));
#else
                    stack[sp-1] = logf(fabsf(op1));
#endif
                break;
#endif
            }
            // get next codop
            codop = k_progs[start_prog++];
        } // codop interpret loop

        float result = fabsf(stack[0] - currentOutput);
        // "!(x < BIG)" rather than "x >= BIG" so that NaN is capped as well
        if (!(result < BIG_NUMBER))
            result = BIG_NUMBER;
        else if (result < PROBABLY_ZERO)
            result = 0.0f;

        if (result <= HIT_LEVEL)
            hits++;
        sum += result;  // sum raw error on all training cases
    } // LOOP ON TRAINING CASES

    // gather results from all threads => we need to synchronize
    tmpresult[tid] = sum;
    tmphits[tid] = hits;
    // Mandatory barrier: thread 0 reads every slot written by the other
    // threads (which span several warps); without it the reduction races
    // with those writes and can sum stale or uninitialised values.
    __syncthreads();

    if (tid == 0) {
        for (int i = 1; i < NUMTHREAD2; i++) {
            tmpresult[0] += tmpresult[i];
            tmphits[0] += tmphits[i];
        }
        k_results[index] = tmpresult[0];
        k_hits[index] = tmphits[0];
    }
    // here results and hits have been stored in their respective array: we can leave
}
......@@ -100,7 +100,6 @@ extern CEvolutionaryAlgorithm* EA;
\INSERT_BOUND_CHECKING
// Evaluation entry point receiving a population array and its size.
// Prints a trace line, then runs whatever the template placeholder on the
// following line is replaced with.
// NOTE(review): presumably the placeholder is expanded by the EASEA code
// generator with the user-supplied evaluation code - confirm.
void evale_pop_chunk(CIndividual** population, int popSize){
printf("evalPopChunk\n");
\INSTEAD_EVAL_FUNCTION
}
......@@ -405,9 +404,13 @@ public:
\START_CUDA_MAKEFILE_TPL
NVCC=nvcc
EASEALIB_PATH=\EZ_PATHlibeasea/#/home/kruger/Bureau/Easea/libeasea/
CXXFLAGS = -O2 -g -Wall -fmessage-length=0 -I$(EASEALIB_PATH)include
CXXFLAGS = -g -I$(EASEALIB_PATH)include
\INSERT_MAKEFILE_OPTION#END OF USER MAKEFILE OPTIONS
OBJS = EASEA.o EASEAIndividual.o
......@@ -416,11 +419,11 @@ LIBS = -lboost_program_options
TARGET = EASEA
$(TARGET): $(OBJS)
$(CXX) -o $(TARGET) $(OBJS) $(LIBS) -g $(EASEALIB_PATH)libeasea.a
$(NVCC) -o $(TARGET) $(OBJS) $(LIBS) -g $(EASEALIB_PATH)libeasea.a
#%.o:%.cpp
# $(CXX) -c $(CXXFLAGS) $^
%.o:%.cu
$(NVCC) -c $(CXXFLAGS) $^ $(NVCC_OPT)
all: $(TARGET)
clean:
......@@ -597,7 +600,7 @@ easeaclean:
--plotStats=\PLOT_STATS #plot Stats with gnuplot (requires Gnuplot)
--printInitialPopulation=0 #Print initial population
--printFinalPopulation=0 #Print final population
--generateCVSFile=\GENERATE_CVS_FILE
--generateCSV=\GENERATE_CVS_FILE
--generateGnuplotScript=\GENERATE_GNUPLOT_SCRIPT
--generateRScript=\GENERATE_R_SCRIPT
\TEMPLATE_END
......@@ -597,7 +597,7 @@ easeaclean:
--plotStats=\PLOT_STATS #plot Stats with gnuplot (requires Gnuplot)
--printInitialPopulation=0 #Print initial population
--printFinalPopulation=0 #Print final population
--generateCVSFile=\GENERATE_CVS_FILE
--generateCSV=\GENERATE_CVS_FILE
--generateGnuplotScript=\GENERATE_GNUPLOT_SCRIPT
--generateRScript=\GENERATE_R_SCRIPT
\TEMPLATE_END
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment