Commit a622ff8c authored by Ogier Maitre

Release candidate 1.09

-Corrected bugs in CUDA.tpl.
-Improved user information by printing the proper population size to distribute to the current GPGPU system.
-Fixed a bug in GPU evaluation: evaluations are now really done in parallel on all the cards (they were serialized before).
-Added instrumentation to CEvolutionaryAlgorithm.cpp (INSTRUMENTED must be defined to use this feature).
     -If the timing.csv file does not already exist, a description of each field is written first in the file.
-It is now possible to specify, at execution time, a range of GPUs to be used for evaluation. This is done with the --fstgpu and --lstgpu arguments, which select the GPUs in the half-open range [fstgpu,lstgpu[ (see the example below).
-Improved the installer for Linux and Mac OS X.
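Example (hypothetical invocation; the binary name depends on the compiled .ez file, here assuming examples/weierstrass_std/weierstrass.ez yields ./weierstrass): running "./weierstrass --fstgpu 1 --lstgpu 3" restricts evaluation to GPUs 1 and 2.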
parent c0c9e7f1
......@@ -2,19 +2,16 @@ UNAME := $(shell uname)
ifeq ($(shell uname -o 2>/dev/null),Msys)
OS := MINGW
endif
EXEC = easea
EXEC = bin/easea
CPPFLAGS += -DUNIX_OS -Ialexyacc/include/ -g -Wno-deprecated -DDEBUG -DLINE_NUM_EZ_FILE
CPPC = g++
LDFLAGS =
OBJ= build/EaseaSym.o build/EaseaParse.o build/EaseaLex.o alexyacc/libalex.a build/EaseaYTools.o boost/program_options.a libeasea/libeasea.a
#ifeq ($(UNAME),Darwin)
$(EXEC):EaseaSym.o EaseaParse.o EaseaLex.o alexyacc/libalex.a EaseaYTools.o boost/program_options.a libeasea/libeasea.a
#else
#$(EXEC):EaseaSym.o EaseaParse.o EaseaLex.o alexyacc/libalex.a EaseaYTools.o libeasea/libeasea.a
#endif
$(CPPC) $(CPPFLAGS) $(LDFLAGS) $^ -o $@
$(EXEC):build bin $(OBJ)
$(CPPC) $(CPPFLAGS) $(LDFLAGS) $(OBJ) -o $@
ifneq ("$(OS)","")
@echo #
@echo # Congratulations ! It looks like you compiled EASEA successfully.
......@@ -26,59 +23,42 @@ ifneq ("$(OS)","")
@echo #
@echo # Thanks for using EASEA.
@echo #
else ifeq ($(UNAME),Darwin)
#
# Congratulations ! It looks like you compiled EASEA successfully.
#
# EZ_PATH was automatically added to your .profile at the end of the compilation
#
# Easea could be moved to a bin directory or included in the PATH
# as long as users have defined an EZ_PATH environment variable
# pointing to the Easea directory.
# To do this temporarily type :
# export EZ_PATH=`pwd`/
# Or define EZ_PATH in your .profile file :
# For example :
# export EZ_PATH=/path/to/easea/directory/
#
# Otherwise you can use easea from this directory by typing :
# For example :
# ./easea examples/weierstrass_std/weierstrass.ez
# Go to the target directory and type make
#
# To Activate the EZ_PATH variable type:
# source ~/.profile
#
# Thanks for using EASEA.
#
else
#
# Congratulations ! It looks like you compiled EASEA successfully.
#
# Generated files depend on libboost-program-options,
# be sure that the development version of this library
# is installed on your system :
# For example, on ubuntu :
# sudo apt-get install libboost-program-options-dev
#
# EZ_PATH was automatically added to your .bashrc at the end of the compilation
# You can now install easea into your system or use it from
# its current directory.
#
# Easea could be moved to a bin directory or included in the PATH
# as long as users have defined an EZ_PATH environment variable
# pointing to the Easea directory.
# To do this temporarily type :
# export EZ_PATH=`pwd`/
# Or define EZ_PATH in your bashrc file (for bash users) :
# For example :
# export EZ_PATH=/path/to/easea/directory/
# Installation:
# To install EASEA into your system, type:
# "sudo make install".
# EASEA will be installed into the /usr/local/easea/ directory,
# including the binary, its libraries and the templates.
# Finally, environment variables will be updated (EZ_PATH and PATH),
ifeq ($(UNAME),Darwin)
# into your .bash_profile file.
else
# into your .bashrc file.
endif
#
# Otherwise you can use easea from this directory by typing :
# For example :
# ./easea examples/weierstrass_std/weierstrass.ez
# Go to the target directory and type make
# Local Usage:
# All EASEA elements will stay in the current directory,
# but some environment variables need to be updated into your
ifeq ($(UNAME),Darwin)
# .bash_profile file (EZ_PATH and PATH). To do so type:
else
# .bashrc file (EZ_PATH and PATH). To do so type:
endif
# "make dev_vars".
#
# To Activate the EZ_PATH variable type:
# source ~/.profile
# Finally after having "install" or "dev_vars", reload bash config file
ifeq ($(UNAME),Darwin)
# (by "exec bash -l" or "source ~/.bash_profile", use easea with:
else
# (by "exec bash" or "source ~/.bashrc", use easea with:
endif
# easea weierstrass.ez
#
# Thanks for using EASEA.
#
......@@ -92,42 +72,52 @@ endif
# $(CPPC) $(CPPFLAGS) $(LDFLAGS) $^ -o $@ -lalex
install:
mkdir -p /usr/local/easea/ /usr/local/easea/bin /usr/local/easea/tpl /usr/local/easea/libeasea/include /usr/local/easea/boost
cp easea /usr/local/easea/bin/
install:vars
mkdir -p /usr/local/easea/ /usr/local/easea/bin /usr/local/easea/tpl /usr/local/easea/libeasea/include /usr/local/easea/boost /usr/local/easea/easeagrapher/
cp bin/easea /usr/local/easea/bin/
cp tpl/* /usr/local/easea/tpl/
cp libeasea/include/* /usr/local/easea/libeasea/include/
cp libeasea/libeasea.a /usr/local/easea/libeasea/
cp boost/program_options.a /usr/local/easea/boost
cp -r boost/boost/ /usr/local/easea/boost/boost/
cp easeagrapher/EaseaGrapher.jar /usr/local/easea/easeagrapher/
vars:
ifeq ($(UNAME), Darwin)
@sed '/EZ_PATH/d' $(HOME)/.profile>$(HOME)/.profile_save
@mv $(HOME)/.profile_save $(HOME)/.profile
@echo "export EZ_PATH=/usr/local/easea/">>$(HOME)/.profile
@echo "export PATH=\$$PATH:/usr/local/easea/bin" >>$(HOME)/.profile
@sed '/EZ_PATH/d' $(HOME)/.bash_profile>$(HOME)/.bash_profile_save
@mv $(HOME)/.bash_profile_save $(HOME)/.bash_profile
@echo "export EZ_PATH=/usr/local/easea/">>$(HOME)/.bash_profile
@echo "export PATH=\$$PATH:/usr/local/easea/bin:" >>$(HOME)/.bash_profile
else
@echo "\nexport EZ_PATH=/usr/local/easea/">>$(HOME)/.bashrc
@echo "export PATH=\$$PATH:/usr/local/easea/bin" >>$(HOME)/.bashrc
@echo "export PATH=\$$PATH:/usr/local/easea/bin:" >>$(HOME)/.bashrc
@echo "PATH and EZ_PATH variables have been set"
endif
build:
@test -d build || mkdir build || echo "Cannot make dir build"
bin:
@test -d bin || mkdir bin || echo "Cannot make dir bin"
dev_vars:
ifeq ($(UNAME), Darwin)
@echo "export EZ_PATH=$(PWD)/">>$(HOME)/.profile
@echo >> $(HOME)/.bash_profile
@echo "export EZ_PATH=$(PWD)/">>$(HOME)/.bash_profile
@echo "export PATH=\$$PATH:$(PWD)/bin/">>$(HOME)/.bash_profile
else
@echo "\nexport EZ_PATH=$(PWD)/">>$(HOME)/.bashrc
@echo >> $(HOME)/.bashrc
@echo "export EZ_PATH=$(PWD)/">>$(HOME)/.bashrc
@echo "export PATH=\$$PATH:$(PWD)/bin/">>$(HOME)/.bashrc
endif
EaseaParse.o: EaseaParse.cpp EaseaLex.cpp
$(CPPC) $(CPPFLAGS) $< -o $@ -c
EaseaLex.o: EaseaLex.cpp
$(CPPC) $(CPPFLAGS) $< -o $@ -c
build/EaseaParse.o: EaseaParse.cpp EaseaLex.cpp
$(CPPC) $(CPPFLAGS) $< -o $@ -c -w
build/EaseaLex.o: EaseaLex.cpp
$(CPPC) $(CPPFLAGS) $< -o $@ -c -w
%.o:%.cpp
build/%.o:%.cpp
$(CPPC) $(CPPFLAGS) -c -o $@ $<
#compile library for alex and ayacc unix version
......@@ -153,7 +143,7 @@ ifneq ("$(OS)","")
cd libeasea && make clean
cd boost && make clean
else
rm -f *.o $(EXEC) $(EXEC)_bin
rm -f build/*.o $(EXEC) $(EXEC)_bin
cd alexyacc && make clean
cd libeasea && make clean
cd boost && make clean
......
......@@ -19,11 +19,9 @@ __________________________________________________________*/
float pMutPerGene=0.1;
\end
\User functions:
//fitness function
#include <math.h>
__device__ __host__ inline static float SQR(float d)
......@@ -63,7 +61,7 @@ float gauss()
/* Generates a normally distributed random value with variance 1 and 0 mean.
Algorithm based on "gasdev" from Numerical Recipes, p. 203. */
{
int iset = 0;
static int iset = 0;
float gset = 0.0;
float v1 = 0.0, v2 = 0.0, r = 0.0;
float factor = 0.0;
......@@ -87,18 +85,20 @@ float gauss()
}
\end
\User CUDA:
\end
\Before everything else function:
//cout<<"Before everything else function called "<<endl;
{
}
\end
\After everything else function:
//cout << "After everything else function called" << endl;
\end
\At the beginning of each generation function:
//cout << "At the beginning of each generation function called" << endl;
\At the beginning of each generation function:{
}
\end
\At the end of each generation function:
......@@ -147,8 +147,8 @@ GenomeClass {
if (tossCoin(pMutPerGene)){
NbMut++;
Genome.sigma[i] = Genome.sigma[i] * exp(SIGMA*pond*(float)gauss());
Genome.sigma[i] = MIN(0.5,Genome.sigma[0]);
Genome.sigma[i] = MAX(0.,Genome.sigma[0]);
Genome.sigma[i] = MIN(0.5,Genome.sigma[i]);
Genome.sigma[i] = MAX(0.,Genome.sigma[i]);
Genome.x[i] += Genome.sigma[i]*(float)gauss();
Genome.x[i] = MIN(X_MAX,Genome.x[i]); // to avoid going out of bounds
Genome.x[i] = MAX(X_MIN,Genome.x[i]);
......@@ -157,10 +157,12 @@ return NbMut;
\end
\GenomeClass::evaluator : // Returns the score
{
float Score= 0.0;
Score= Weierstrass(Genome.x, SIZE);
//Score= rosenbrock(Genome.x);
return Score;
}
\end
\User Makefile options:
......
#include <math.h>
#include <stdlib.h>
#include "include/CCuda.h"
#include <stdio.h>
CCuda::CCuda(unsigned parentSize, unsigned offSize, unsigned individualImplSize){
this->sizeOfIndividualImpl = individualImplSize;
this->cudaBuffer = (void*)malloc(this->sizeOfIndividualImpl*( (parentSize>offSize) ? parentSize : offSize));
}
CCuda::~CCuda(){
}
bool repartition(struct my_struct_gpu* gpu_infos){
//There is an implied minimum number of threads for each block
if(gpu_infos->num_Warp > gpu_infos->num_thread_max){
printf("You need to authorized at least %d threads on each block!\n",gpu_infos->num_Warp);
exit(1);
}
gpu_infos->dimGrid = gpu_infos->num_MP;
gpu_infos->dimBlock = gpu_infos->num_Warp;
//While the whole population cannot yet be placed on the card
while(gpu_infos->dimBlock * gpu_infos->dimGrid < gpu_infos->sh_pop_size) {
//Each iteration, add the warp size to dimBlock
if( (gpu_infos->dimBlock += gpu_infos->num_Warp) > gpu_infos->num_thread_max ) {
//If dimBlock exceeds the maximum number of threads per block, add the number of MPs to dimGrid and reset dimBlock to the warp size
gpu_infos->dimGrid += gpu_infos->num_MP;
gpu_infos->dimBlock = gpu_infos->num_Warp;
}
}
//Verify that there is enough room for the whole population and that all constraints are respected
if( (gpu_infos->dimBlock*gpu_infos->dimGrid >= gpu_infos->sh_pop_size) && (gpu_infos->dimBlock <= gpu_infos->num_thread_max))
return true;
else
return false;
}
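// Worked example (hypothetical values): with num_MP = 14, num_Warp = 32,
// num_thread_max = 512 and sh_pop_size = 2048, the loop starts at dimGrid = 14,
// dimBlock = 32 (capacity 448) and grows dimBlock by one warp per iteration:
// 448, 896, 1344, 1792, then 14*160 = 2240 >= 2048, so repartition() returns
// true with 14 blocks of 160 threads.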
......@@ -30,6 +30,16 @@
#include <sstream>
#include <fstream>
//#define INSTRUMENTED
#ifdef INSTRUMENTED
#define TIMING
#include <timing.h>
#else
#define TIME_ST(f)
#define TIME_END(f)
#define TIME_ACC(f)
#endif
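// The instrumented branch also relies on DECLARE_TIME, DECLARE_TIME_ACC and
// COMPUTE_TIME from libeasea's timing.h, which is not shown in this diff.
// As a rough, hypothetical sketch only (the real header may differ), macros
// compatible with the usage below could look like:
//   #include <sys/time.h>
//   #define DECLARE_TIME(t)     struct timeval t##_beg, t##_end, t##_res
//   #define DECLARE_TIME_ACC(t) struct timeval t##_beg, t##_end, t##_res, t##_acc = {0,0}
//   #define TIME_ST(t)          gettimeofday(&t##_beg, NULL)
//   #define TIME_END(t)         gettimeofday(&t##_end, NULL)
//   #define COMPUTE_TIME(t)     timersub(&t##_end, &t##_beg, &t##_res)
//   #define TIME_ACC(t)         do { COMPUTE_TIME(t); timeradd(&t##_res, &t##_acc, &t##_acc); } while(0)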
using namespace std;
extern CRandomGenerator* globalRandomGenerator;
......@@ -165,9 +175,30 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
gettimeofday(&begin,0);
#endif
#ifdef INSTRUMENTED
const char* timing_file_name = "timing.csv";
FILE* timing_file = NULL;
if( access(timing_file_name,W_OK)!=0 ){
// if the file does not already exist, start by writing a header describing each field
timing_file = fopen("timing.csv","w");
fprintf(timing_file,"gen,popSize,init,eval,breeding,reduction\n");
}
else{
timing_file = fopen("timing.csv","a");
}
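// Illustrative content of timing.csv after one run (hypothetical numbers):
//   gen,popSize,init,eval,breeding,reduction
//   100,1024,0.003214,12.504321,0.801234,0.120045
// One data line is appended per execution: generation count, parent population
// size, then seconds.microseconds for initialisation, accumulated evaluation,
// breeding and reduction times.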
DECLARE_TIME(init);
DECLARE_TIME_ACC(eval);
//DECLARE_TIME_ACC(optim);
DECLARE_TIME_ACC(breeding);
DECLARE_TIME_ACC(reduction);
#endif
std::cout << "Population initialisation (Generation 0)... "<< std::endl;
this->initializeParentPopulation();
TIME_ST(init);this->initializeParentPopulation();TIME_END(init);
TIME_ST(eval);
if(!INSTEAD_EVAL_STEP)
this->population->evaluateParentPopulation();
else
......@@ -176,6 +207,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
if(this->params->optimise){
population->optimiseParentPopulation();
}
TIME_END(eval);
TIME_ACC(eval);
this->population->currentEvaluationNb += this->params->parentPopulationSize;
if(this->params->printInitialPopulation){
......@@ -198,9 +231,12 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
// Sending individuals if remote island model
if(params->remoteIslandModel && this->numberOfClients>0)
this->sendIndividual();
TIME_ST(breeding);
population->produceOffspringPopulation();
TIME_END(breeding);
TIME_ACC(breeding);
TIME_ST(eval);
if(!INSTEAD_EVAL_STEP)
population->evaluateOffspringPopulation();
else
......@@ -210,6 +246,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
if(this->params->optimise){
population->optimiseOffspringPopulation();
}
TIME_END(eval);
TIME_ACC(eval);
EASEAGenerationFunctionBeforeReplacement(this);
......@@ -227,6 +265,7 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
}
TIME_ST(reduction);
if( params->parentReduction )
population->reduceParentPopulation(params->parentReductionSize);
......@@ -234,6 +273,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
population->reduceOffspringPopulation( params->offspringReductionSize );
population->reduceTotalPopulation(elitistPopulation);
TIME_END(reduction);
TIME_ACC(reduction);
population->sortParentPopulation();
//if( this->params->printStats || this->params->generateCSVFile )
......@@ -281,6 +322,17 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){
if(this->params->plotStats){
delete this->grapher;
}
#ifdef INSTRUMENTED
COMPUTE_TIME(init);
fprintf(timing_file,"%d,%d,%ld.%06ld,%ld.%06ld,%ld.%06ld,%ld.%06ld\n",
currentGeneration, population->parentPopulationSize,
init_res.tv_sec,init_res.tv_usec,
eval_acc.tv_sec,eval_acc.tv_usec,
breeding_acc.tv_sec,breeding_acc.tv_usec,
reduction_acc.tv_sec,reduction_acc.tv_usec);
fclose(timing_file);
#endif
}
......
......@@ -156,6 +156,8 @@ void parseArguments(const char* parametersFileName, int ac, char** av,
("printFinalPopulation",po::value<int>(),"Prints the final population (default : 0)")
("savePopulation",po::value<int>(),"Saves population at the end (default : 0)")
("startFromFile",po::value<int>(),"Loads the population from a .pop file (default : 0")
("fstgpu",po::value<int>(),"The number of the first GPU used for computation")
("lstgpu",po::value<int>(),"The number of the fisrt GPU NOT used for computation")
("u1",po::value<string>(),"User defined parameter 1")
("u2",po::value<string>(),"User defined parameter 2")
("u3",po::value<int>(),"User defined parameter 3")
......
......@@ -8,7 +8,7 @@ CXXFLAGS = -g -Wall -fmessage-length=0 #-I../boost/
OBJS = CRandomGenerator.o CSelectionOperator.o CEvolutionaryAlgorithm.o\
CStoppingCriterion.o COptionParser.o CPopulation.o CIndividual.o\
CGrapher.o CCmaes.o CCmaesCuda.o Parameters.o CGPNode.o\
CComUDPLayer.o CCuda.o CStats.o
CComUDPLayer.o CStats.o
ifneq ("$(OS)","")
OBJS += inet_pton.o
......@@ -26,7 +26,7 @@ TARGET = libeasea.a
ifneq ("$(OS)","")
CPPFLAGS=-I..\boost\ #-pg
else
CPPFLAGS=-I/usr/local/cuda/include/ -I../boost/ #-pg
CPPFLAGS=-I../boost/ #-pg
endif
$(TARGET): $(OBJS)
......
/*
* CCuda.h
*
* Created on: 23 June 2009
* Author: maitre
*/
#ifndef CCUDA_H_
#define CCUDA_H_
#include <iostream>
#include <semaphore.h>
//#include <cuda_runtime_api.h>
#define CUDA_SAFE_CALL(f) \
{ \
cudaError_t err; \
err = f; \
if( err != cudaSuccess ){ \
printf("Error : %s\n",cudaGetErrorString(err)); \
exit(-1); \
} \
}
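// Typical usage of the macro above (illustrative):
//   CUDA_SAFE_CALL( cudaMalloc(&d_buffer, bufferSize) );
// On failure the CUDA error string is printed and the program exits.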
struct gpuOptions{};
struct my_struct_gpu{
int indiv_start;
int sh_pop_size;
int num_MP;
int num_thread_max;
int num_Warp;
int dimGrid;
int dimBlock;
};
struct gpuArg{
int threadId;
sem_t sem_in;
sem_t sem_out;
void* d_population;
float* d_fitness;
};
class CCuda {
public:
void* cudaBuffer;
unsigned sizeOfIndividualImpl;
struct gpuOptions initOpts;
public:
CCuda(unsigned parentSize, unsigned offSize, unsigned individualImplSize);
~CCuda();
};
bool repartition(struct my_struct_gpu* gpu_infos);
#endif /* CCUDA_H_ */
......@@ -78,6 +78,9 @@ public:
char* outputFilename;
char* plotOutputFilename;
int fstGpu;
int lstGpu;
public:
#ifdef WIN32
Parameters();
......
eTEMPLATE_START
\TEMPLATE_START
#ifdef WIN32
#define _CRT_SECURE_NO_WARNINGS
#pragma comment(lib, "libEasea.lib")
......@@ -78,7 +78,6 @@ int main(int argc, char** argv){
#include "CEvolutionaryAlgorithm.h"
#include "global.h"
#include "CIndividual.h"
#include "CCuda.h"
#include <vector_types.h>
......@@ -94,6 +93,9 @@ extern CEvolutionaryAlgorithm *EA;
struct gpuArg* gpuArgs;
int fstGpu = 0;
int lstGpu = 0;
struct my_struct_gpu* gpu_infos;
float* fitnessTemp;
......@@ -111,41 +113,38 @@ PopulationImpl* Pop = NULL;
\INSERT_USER_FUNCTIONS
void cudaPreliminaryProcess(unsigned PopulationSize){
int capacite_max = 0;
int noTotalMP = 0; // total number of MPs, used to distribute the population
int count = 0;
//Retrieve each device's information.
for( int index = 0; index < num_gpus; index++){
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, index);
cudaError_t lastError = cudaGetDeviceProperties(&deviceProp, index+fstGpu);
if( lastError!=cudaSuccess ){
std::cerr << "Cannot get device information for device no : " << index+fstGpu << std::endl;
exit(-1);
}
gpu_infos[index].num_MP = deviceProp.multiProcessorCount*2; //Two block on each MP
gpu_infos[index].num_MP = deviceProp.multiProcessorCount; //One block per MP
gpu_infos[index].num_thread_max = deviceProp.maxThreadsPerBlock*0.5; //Use only 50% of the real maximum number of threads per block, to be sure to have enough memory for all of them.
gpu_infos[index].num_Warp = deviceProp.warpSize;
capacite_max += gpu_infos[index].num_MP * gpu_infos[index].num_thread_max;
}
noTotalMP += gpu_infos[index].num_MP;
gpu_infos[index].gpuProp = deviceProp;
int count = 0;
//We can have different cards, so we put more or fewer individuals on each of them according to their respective capacities.
}
for( int index = 0; index < num_gpus; index++){
gpu_infos[index].indiv_start = count;
//On the first cards we place as many individuals as possible.
if(index != (num_gpus - 1))
gpu_infos[index].sh_pop_size = ceil((float)PopulationSize * (((float)gpu_infos[index].num_MP*(float)gpu_infos[index].num_thread_max) / (float)capacite_max) );
gpu_infos[index].sh_pop_size = ceil((float)PopulationSize * (((float)gpu_infos[index].num_MP) / (float)noTotalMP) );
//On the last card we are going to place the remaining individuals.
else
gpu_infos[index].sh_pop_size = PopulationSize - count;
count += gpu_infos[index].sh_pop_size;
/*
* The number of threads will be a multiple of the warp size, less than or equal to the maximum number of threads per block.
* The number of blocks will be a multiple of the number of MPs.
*/
if( !repartition(&gpu_infos[index]))
exit( -1 );
std::cout << "Device number : " << index << " Number of block : " << gpu_infos[index].dimGrid << std::endl;
std::cout << "Device number : " << index << " Number of thread : " << gpu_infos[index].dimBlock << std::endl;
}
}
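// Worked example (hypothetical devices): with two cards exposing 30 and 14 MPs
// and PopulationSize = 1000, noTotalMP = 44; the first card receives
// ceil(1000 * 30/44) = 682 individuals and the last card the remaining 318.
// repartition() then derives a dimGrid/dimBlock pair for each share.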
......@@ -158,6 +157,7 @@ __device__ float cudaEvaluate(void* devBuffer, unsigned id, struct gpuOptions in
}
extern "C"
__global__ void cudaEvaluatePopulation(void* d_population, unsigned popSize, float* d_fitnesses, struct gpuOptions initOpts){
unsigned id = (blockDim.x*blockIdx.x)+threadIdx.x; // id of the individual computed by this thread
......@@ -175,9 +175,25 @@ void* gpuThreadMain(void* arg){
cudaError_t lastError;
struct gpuArg* localArg = (struct gpuArg*)arg;
cudaSetDevice(localArg->threadId);
std::cout << " gpuId : " << localArg->gpuId << std::endl;
lastError = cudaSetDevice(localArg->gpuId);
if( lastError != cudaSuccess ){
std::cerr << "Error, cannot set device properly for device no " << localArg->gpuId << std::endl;
exit(-1);
}
int nbr_cudaPreliminaryProcess = 2;
struct my_struct_gpu* localGpuInfo = gpu_infos+localArg->threadId;
struct cudaFuncAttributes attr;
lastError = cudaFuncGetAttributes(&attr,"cudaEvaluatePopulation");
if( lastError != cudaSuccess ){
std::cerr << "Error, cannot get function attribute for cudaEvaluatePopulation on card: " << localGpuInfo->gpuProp.name << std::endl;
exit(-1);
}
// Because of the per-GPU-thread context, we have to put all the user's CUDA
// initialisations here if we want to use them on the GPU; otherwise they are
// not found in the GPU context.
......@@ -187,22 +203,61 @@ void* gpuThreadMain(void* arg){
while(1){
sem_wait(&localArg->sem_in);
if( freeGPU ) {
// do we need to free gpu memory
cudaFree(localArg->d_fitness);
cudaFree(localArg->d_population);
break;
}
if(nbr_cudaPreliminaryProcess > 0) {
lastError = cudaMalloc(&localArg->d_population,gpu_infos[localArg->threadId].sh_pop_size*(sizeof(IndividualImpl)));
lastError = cudaMalloc(((void**)&localArg->d_fitness),gpu_infos[localArg->threadId].sh_pop_size*sizeof(float));
// we should free GPU buffers when
// here we will compute how to spread the population to evaluate on GPGPU cores
int thLimit = attr.maxThreadsPerBlock;
int N = localGpuInfo->sh_pop_size;
int w = localGpuInfo->gpuProp.warpSize;
int b=0,t=0;
do{
b += localGpuInfo->num_MP;
t = ceilf( MIN(thLimit,(float)N/b)/w)*w;
} while( (b*t<N) || t>thLimit );
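// Worked example (hypothetical values): for a share of N = 300 individuals on a
// card with 14 MPs, warpSize = 32 and thLimit = 1024, the first iteration gives
// b = 14 and t = ceil(min(1024, 300/14)/32)*32 = 32; since 14*32 = 448 >= 300
// and 32 <= 1024, the loop stops and the share is evaluated with 14 blocks of
// 32 threads.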
if( localArg->d_population!=NULL ){ cudaFree(localArg->d_population); }
if( localArg->d_fitness!=NULL ){ cudaFree(localArg->d_fitness); }