From a622ff8cb90ae732544dc7504861ce5784b5f806 Mon Sep 17 00:00:00 2001
From: Ogier Maitre
Date: Tue, 17 Apr 2012 16:03:24 +0200
Subject: [PATCH] Release candidate 1.09

-Corrected bugs in CUDA.tpl.
-Improved user information by printing the population size distributed to the current GPGPU system.
-Fixed a bug in GPU evaluation: evaluations are now really done in parallel on all the cards (they were serialized before).
-Added instrumentation tools to CEvolutionaryAlgorithm.cpp (INSTRUMENTED must be defined to use this feature).
-If the timing.csv file does not already exist, a description of each field is written first in the file.
-It is now possible to specify, at execution time, a range of GPUs to be used for evaluation. This is done with the --fstgpu and --lstgpu arguments, which select the GPUs in the half-open range [fstgpu,lstgpu[.
-Improved the installer for Linux and MacOS X.
---
 Makefile                            | 128 ++++++++---------
 examples/weierstrass/weierstrass.ez |  20 +--
 libeasea/CCuda.cpp                  |  45 ------
 libeasea/CEvolutionaryAlgorithm.cpp |  56 +++++++-
 libeasea/COptionParser.cpp          |   2 +
 libeasea/Makefile                   |   4 +-
 libeasea/include/CCuda.h            |  64 ---------
 libeasea/include/Parameters.h       |   3 +
 tpl/CUDA.tpl                        | 210 +++++++++++++++++++---------
 9 files changed, 278 insertions(+), 254 deletions(-)
 delete mode 100755 libeasea/CCuda.cpp
 delete mode 100755 libeasea/include/CCuda.h

diff --git a/Makefile b/Makefile
index 8b39a23..cdbbffb 100755
--- a/Makefile
+++ b/Makefile
@@ -2,19 +2,16 @@
 UNAME := $(shell uname)
 ifeq ($(shell uname -o 2>/dev/null),Msys)
 	OS := MINGW
 endif
-EXEC = easea
+EXEC = bin/easea
 
 CPPFLAGS += -DUNIX_OS -Ialexyacc/include/ -g -Wno-deprecated -DDEBUG -DLINE_NUM_EZ_FILE
 CPPC = g++
 LDFLAGS =
-
+OBJ= build/EaseaSym.o build/EaseaParse.o build/EaseaLex.o alexyacc/libalex.a build/EaseaYTools.o boost/program_options.a libeasea/libeasea.a
 
 #ifeq ($(UNAME),Darwin)
-$(EXEC):EaseaSym.o EaseaParse.o EaseaLex.o alexyacc/libalex.a EaseaYTools.o boost/program_options.a libeasea/libeasea.a
-#else
-#$(EXEC):EaseaSym.o EaseaParse.o EaseaLex.o alexyacc/libalex.a EaseaYTools.o libeasea/libeasea.a
-#endif
-	$(CPPC) $(CPPFLAGS) $(LDFLAGS) $^ -o $@
+$(EXEC):build bin $(OBJ)
+	$(CPPC) $(CPPFLAGS) $(LDFLAGS) $(OBJ) -o $@
 ifneq ("$(OS)","")
 	@echo #
 	@echo # Congratulations ! It looks like you compiled EASEA successfully.
@@ -26,59 +23,42 @@ ifneq ("$(OS)","")
 	@echo #
 	@echo # Thanks for using EASEA.
 	@echo #
-else ifeq ($(UNAME),Darwin)
+else
 	#
 	# Congratulations ! It looks like you compiled EASEA successfully.
 	#
-	# EZ_PATH was automatically added to your .profile at the end of the compilation
-	#
-	# Easea could be moved to a bin directory or included in the PATH
-	# as long as users have defined a EZ_PATH environment variable
-	# pointing to the Easea directory.
-	# To do this temporally type :
-	# export EZ_PATH=`pwd`/
-	# Or define EZ_PATH in your .profile file :
-	# For example :
-	# export EZ_PATH=/path/to/easea/directory/
-	#
-	# Otherwise you can use easea from this directory by typing :
-	# For example :
-	# ./easea examples/weierstrass_std/weierstrass.ez
-	# Go to the taget directory and type make
-	#
-	# To Activate the EZ_PATH variable type:
-	# source ~/.profile
-	#
-	# Thanks for using EASEA.
+	# You can now install easea into your system or use it from
+	# its current directory.
 	#
+	#   Installation:
+	#   To install EASEA into your system, type:
+	#   "sudo make install".
+	#   EASEA will be installed into the /usr/local/easea/ directory,
+	#   including the binary, its libraries and the templates.
+	#   Finally, the EZ_PATH and PATH environment variables will be updated
+ifeq ($(UNAME),Darwin)
+	#   in your .bash_profile file.
 else
-	#
-	# Congratulations ! It looks like you compiled EASEA successfully.
+	#   in your .bashrc file.
+endif
 	#
-	# Generated files depend on libboost-program-options,
-	# be sure that the development version of this library
-	# is installed on you system :
-	# For example, on ubuntu :
-	# sudo apt-get install libboost-program-options-dev
-	#
-	# EZ_PATH was automatically added to your .bashrc at the end of the compilation
-	#
-	# Easea could be moved to a bin directory or included in the PATH
-	# as long as users have defined a EZ_PATH environment variable
-	# pointing to the Easea directory.
-	# To do this temporally type :
-	# export EZ_PATH=`pwd`/
-	# Or define EZ_PATH in your bashrc file (for bash users) :
-	# For example :
-	# export EZ_PATH=/path/to/easea/directory/
-	#
-	# Otherwise you can use easea from this directory by typing :
-	# For example :
-	# ./easea examples/weierstrass_std/weierstrass.ez
-	# Go to the taget directory and type make
+	#   Local Usage:
+	#   All EASEA elements will stay in the current directory,
+	#   but the EZ_PATH and PATH environment variables need to be added
+ifeq ($(UNAME),Darwin)
+	#   to your .bash_profile file. To do so, type:
+else
+	#   to your .bashrc file. To do so, type:
+endif
+	#   "make dev_vars".
 	#
-	# To Activate the EZ_PATH variable type:
-	# source ~/.profile
+	#   Finally, after "make install" or "make dev_vars", reload your bash configuration file
+ifeq ($(UNAME),Darwin)
+	#   (with "exec bash -l" or "source ~/.bash_profile"), then use easea with:
+else
+	#   (with "exec bash" or "source ~/.bashrc"), then use easea with:
+endif
+	#   easea weierstrass.ez
 	#
 	# Thanks for using EASEA.
 	#
@@ -92,42 +72,52 @@ endif
 #	$(CPPC) $(CPPFLAGS) $(LDFLAGS) $^ -o $@ -lalex
 
-install:
-	mkdir -p /usr/local/easea/ /usr/local/easea/bin /usr/local/easea/tpl /usr/local/easea/libeasea/include /usr/local/easea/boost
-	cp easea /usr/local/easea/bin/
+install:vars
+	mkdir -p /usr/local/easea/ /usr/local/easea/bin /usr/local/easea/tpl /usr/local/easea/libeasea/include /usr/local/easea/boost /usr/local/easea/easeagrapher/
+	cp bin/easea /usr/local/easea/bin/
 	cp tpl/* /usr/local/easea/tpl/
 	cp libeasea/include/* /usr/local/easea/libeasea/include/
 	cp libeasea/libeasea.a /usr/local/easea/libeasea/
 	cp boost/program_options.a /usr/local/easea/boost
 	cp -r boost/boost/ /usr/local/easea/boost/boost/
+	cp easeagrapher/EaseaGrapher.jar /usr/local/easea/easeagrapher/
 
 vars:
 ifeq ($(UNAME), Darwin)
-	@sed '/EZ_PATH/d' $(HOME)/.profile>$(HOME)/.profile_save
-	@mv $(HOME)/.profile_save $(HOME)/.profile
-	@echo "export EZ_PATH=/usr/local/easea/">>$(HOME)/.profile
-	@echo "export PATH=\$$PATH:/usr/local/easea/bin" >>$(HOME)/.profile
+	@sed '/EZ_PATH/d' $(HOME)/.bash_profile>$(HOME)/.bash_profile_save
+	@mv $(HOME)/.bash_profile_save $(HOME)/.bash_profile
+	@echo "export EZ_PATH=/usr/local/easea/">>$(HOME)/.bash_profile
+	@echo "export PATH=\$$PATH:/usr/local/easea/bin:" >>$(HOME)/.bash_profile
 else
 	@echo "\nexport EZ_PATH=/usr/local/easea/">>$(HOME)/.bashrc
-	@echo "export PATH=\$$PATH:/usr/local/easea/bin" >>$(HOME)/.bashrc
+	@echo "export PATH=\$$PATH:/usr/local/easea/bin:" >>$(HOME)/.bashrc
 	@echo "PATH and EZ_PATH variables have been set"
 endif
+build:
+	@test -d build || mkdir build || echo "Cannot make dir build"
+bin:
+	@test -d bin || mkdir bin || echo "Cannot make dir bin"
+
 dev_vars:
 ifeq ($(UNAME), Darwin)
-	@echo "export EZ_PATH=$(PWD)/">>$(HOME)/.profile
+	@echo >> 
$(HOME)/.bash_profile + @echo "export EZ_PATH=$(PWD)/">>$(HOME)/.bash_profile + @echo "export PATH=\$$PATH:$(PWD)/bin/">>$(HOME)/.bash_profile else - @echo "\nexport EZ_PATH=$(PWD)/">>$(HOME)/.bashrc + @echo >> $(HOME)/.bashrc + @echo "export EZ_PATH=$(PWD)/">>$(HOME)/.bashrc + @echo "export PATH=\$$PATH:$(PWD)/bin/">>$(HOME)/.bashrc endif -EaseaParse.o: EaseaParse.cpp EaseaLex.cpp - $(CPPC) $(CPPFLAGS) $< -o $@ -c -EaseaLex.o: EaseaLex.cpp - $(CPPC) $(CPPFLAGS) $< -o $@ -c +build/EaseaParse.o: EaseaParse.cpp EaseaLex.cpp + $(CPPC) $(CPPFLAGS) $< -o $@ -c -w +build/EaseaLex.o: EaseaLex.cpp + $(CPPC) $(CPPFLAGS) $< -o $@ -c -w -%.o:%.cpp +build/%.o:%.cpp $(CPPC) $(CPPFLAGS) -c -o $@ $< #compile library for alex and ayacc unix version @@ -153,7 +143,7 @@ ifneq ("$(OS)","") cd libeasea && make clean cd boost && make clean else - rm -f *.o $(EXEC) $(EXEC)_bin + rm -f build/*.o $(EXEC) $(EXEC)_bin cd alexyacc && make clean cd libeasea && make clean cd boost && make clean diff --git a/examples/weierstrass/weierstrass.ez b/examples/weierstrass/weierstrass.ez index 6220858..d6299eb 100755 --- a/examples/weierstrass/weierstrass.ez +++ b/examples/weierstrass/weierstrass.ez @@ -19,11 +19,9 @@ __________________________________________________________*/ float pMutPerGene=0.1; - \end \User functions: -//fitness function #include __device__ __host__ inline static float SQR(float d) @@ -63,7 +61,7 @@ float gauss() /* Generates a normally distributed random value with variance 1 and 0 mean. Algorithm based on "gasdev" from Numerical recipes' pg. 203. */ { - int iset = 0; + static int iset = 0; float gset = 0.0; float v1 = 0.0, v2 = 0.0, r = 0.0; float factor = 0.0; @@ -87,18 +85,20 @@ float gauss() } \end - +\User CUDA: +\end \Before everything else function: - //cout<<"Before everything else function called "< -#include -#include "include/CCuda.h" -#include - - -CCuda::CCuda(unsigned parentSize, unsigned offSize, unsigned individualImplSize){ - this->sizeOfIndividualImpl = individualImplSize; - this->cudaBuffer = (void*)malloc(this->sizeOfIndividualImpl*( (parentSize>offSize) ? 
parentSize : offSize)); -} - -CCuda::~CCuda(){ -} - -bool repartition(struct my_struct_gpu* gpu_infos){ - - //There is an implied minimum number of threads for each block - if(gpu_infos->num_Warp > gpu_infos->num_thread_max){ - printf("You need to authorized at least %d threads on each block!\n",gpu_infos->num_Warp); - exit(1); - } - - gpu_infos->dimGrid = gpu_infos->num_MP; - gpu_infos->dimBlock = gpu_infos->num_Warp;; - - //While each element of the population can't be placed on the card - while(gpu_infos->dimBlock * gpu_infos->dimGrid < gpu_infos->sh_pop_size) { - //Every time we add the number of Warp to the value of dimBlock - if( (gpu_infos->dimBlock += gpu_infos->num_Warp) > gpu_infos->num_thread_max ) { - //If the number of dimBlock exceeds the number of threads max, we add the number of MP to the value of dimGrid and we reset the value of dimBlock with the number of Warp - gpu_infos->dimGrid += gpu_infos->num_MP; - gpu_infos->dimBlock = gpu_infos->num_Warp; - } - } - - - //Verification that we have enough place for all the population and that every constraints are respected - if( (gpu_infos->dimBlock*gpu_infos->dimGrid >= gpu_infos->sh_pop_size) && (gpu_infos->dimBlock <= gpu_infos->num_thread_max)) - return true; - else - return false; -} - - - diff --git a/libeasea/CEvolutionaryAlgorithm.cpp b/libeasea/CEvolutionaryAlgorithm.cpp index 2d2aeaa..043f8ef 100755 --- a/libeasea/CEvolutionaryAlgorithm.cpp +++ b/libeasea/CEvolutionaryAlgorithm.cpp @@ -30,6 +30,16 @@ #include #include +//#define INSTRUMENTED +#ifdef INSTRUMENTED +#define TIMING +#include +#else +#define TIME_ST(f) +#define TIME_END(f) +#define TIME_ACC(f) +#endif + using namespace std; extern CRandomGenerator* globalRandomGenerator; @@ -165,9 +175,30 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ gettimeofday(&begin,0); #endif +#ifdef INSTRUMENTED + const char* timing_file_name = "timing.csv"; + FILE* timing_file = NULL; + if( access(timing_file_name,W_OK)!=0 ){ + // if file does not already exist, start by describing each field + timing_file = fopen("timing.csv","w"); + fprintf(timing_file,"gen,popSize,init,eval,breeding,reduction\n"); + } + else{ + timing_file = fopen("timing.csv","a"); + } + DECLARE_TIME(init); + DECLARE_TIME_ACC(eval); + //DECLARE_TIME_ACC(optim); + DECLARE_TIME_ACC(breeding); + DECLARE_TIME_ACC(reduction); + +#endif + std::cout << "Population initialisation (Generation 0)... 
"<< std::endl; - this->initializeParentPopulation(); + TIME_ST(init);this->initializeParentPopulation();TIME_END(init); + + TIME_ST(eval); if(!INSTEAD_EVAL_STEP) this->population->evaluateParentPopulation(); else @@ -176,6 +207,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ if(this->params->optimise){ population->optimiseParentPopulation(); } + TIME_END(eval); + TIME_ACC(eval); this->population->currentEvaluationNb += this->params->parentPopulationSize; if(this->params->printInitialPopulation){ @@ -198,9 +231,12 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ // Sending individuals if remote island model if(params->remoteIslandModel && this->numberOfClients>0) this->sendIndividual(); - + TIME_ST(breeding); population->produceOffspringPopulation(); + TIME_END(breeding); + TIME_ACC(breeding); + TIME_ST(eval); if(!INSTEAD_EVAL_STEP) population->evaluateOffspringPopulation(); else @@ -210,6 +246,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ if(this->params->optimise){ population->optimiseOffspringPopulation(); } + TIME_END(eval); + TIME_ACC(eval); EASEAGenerationFunctionBeforeReplacement(this); @@ -227,6 +265,7 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ } + TIME_ST(reduction); if( params->parentReduction ) population->reduceParentPopulation(params->parentReductionSize); @@ -234,6 +273,8 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ population->reduceOffspringPopulation( params->offspringReductionSize ); population->reduceTotalPopulation(elitistPopulation); + TIME_END(reduction); + TIME_ACC(reduction); population->sortParentPopulation(); //if( this->params->printStats || this->params->generateCSVFile ) @@ -281,6 +322,17 @@ void CEvolutionaryAlgorithm::runEvolutionaryLoop(){ if(this->params->plotStats){ delete this->grapher; } + +#ifdef INSTRUMENTED + COMPUTE_TIME(init); + fprintf(timing_file,"%d,%d,%ld.%06ld,%ld.%06ld,%ld.%06ld,%ld.%06ld\n", + currentGeneration, population->parentPopulationSize, + init_res.tv_sec,init_res.tv_usec, + eval_acc.tv_sec,eval_acc.tv_usec, + breeding_acc.tv_sec,breeding_acc.tv_usec, + reduction_acc.tv_sec,reduction_acc.tv_usec); + fclose(timing_file); +#endif } diff --git a/libeasea/COptionParser.cpp b/libeasea/COptionParser.cpp index 0006e60..e86de28 100755 --- a/libeasea/COptionParser.cpp +++ b/libeasea/COptionParser.cpp @@ -156,6 +156,8 @@ void parseArguments(const char* parametersFileName, int ac, char** av, ("printFinalPopulation",po::value(),"Prints the final population (default : 0)") ("savePopulation",po::value(),"Saves population at the end (default : 0)") ("startFromFile",po::value(),"Loads the population from a .pop file (default : 0") + ("fstgpu",po::value(),"The number of the first GPU used for computation") + ("lstgpu",po::value(),"The number of the fisrt GPU NOT used for computation") ("u1",po::value(),"User defined parameter 1") ("u2",po::value(),"User defined parameter 2") ("u3",po::value(),"User defined parameter 3") diff --git a/libeasea/Makefile b/libeasea/Makefile index cb34965..34130ba 100755 --- a/libeasea/Makefile +++ b/libeasea/Makefile @@ -8,7 +8,7 @@ CXXFLAGS = -g -Wall -fmessage-length=0 #-I../boost/ OBJS = CRandomGenerator.o CSelectionOperator.o CEvolutionaryAlgorithm.o\ CStoppingCriterion.o COptionParser.o CPopulation.o CIndividual.o\ CGrapher.o CCmaes.o CCmaesCuda.o Parameters.o CGPNode.o\ - CComUDPLayer.o CCuda.o CStats.o + CComUDPLayer.o CStats.o ifneq ("$(OS)","") OBJS += inet_pton.o @@ -26,7 +26,7 @@ TARGET = libeasea.a ifneq ("$(OS)","") CPPFLAGS=-I..\boost\ #-pg else - 
CPPFLAGS=-I/usr/local/cuda/include/ -I../boost/ #-pg + CPPFLAGS=-I../boost/ #-pg endif $(TARGET): $(OBJS) diff --git a/libeasea/include/CCuda.h b/libeasea/include/CCuda.h deleted file mode 100755 index ad7cde6..0000000 --- a/libeasea/include/CCuda.h +++ /dev/null @@ -1,64 +0,0 @@ -/* - * CCuda.h - * - * Created on: 23 juin 2009 - * Author: maitre - */ - -#ifndef CCUDA_H_ -#define CCUDA_H_ - -#include -#include -//#include - - - -#define CUDA_SAFE_CALL(f) \ - { \ - cudaError_t err; \ - err = f; \ - if( err != cudaSuccess ){ \ - printf("Error : %s\n",cudaGetErrorString(err)); \ - exit(-1); \ - } \ - } - - -struct gpuOptions{}; - -struct my_struct_gpu{ - int indiv_start; - int sh_pop_size; - - int num_MP; - int num_thread_max; - int num_Warp; - - int dimGrid; - int dimBlock; -}; - -struct gpuArg{ - int threadId; - sem_t sem_in; - sem_t sem_out; - - void* d_population; - float* d_fitness; - -}; - -class CCuda { -public: - void* cudaBuffer; - unsigned sizeOfIndividualImpl; - struct gpuOptions initOpts; -public: - CCuda(unsigned parentSize, unsigned offSize, unsigned individualImplSize); - ~CCuda(); -}; - -bool repartition(struct my_struct_gpu* gpu_infos); - -#endif /* CCUDA_H_ */ diff --git a/libeasea/include/Parameters.h b/libeasea/include/Parameters.h index ac6f0fe..4f16e73 100755 --- a/libeasea/include/Parameters.h +++ b/libeasea/include/Parameters.h @@ -78,6 +78,9 @@ public: char* outputFilename; char* plotOutputFilename; + int fstGpu; + int lstGpu; + public: #ifdef WIN32 Parameters(); diff --git a/tpl/CUDA.tpl b/tpl/CUDA.tpl index 3a68f72..6d8ab4f 100755 --- a/tpl/CUDA.tpl +++ b/tpl/CUDA.tpl @@ -1,4 +1,4 @@ -eTEMPLATE_START +\TEMPLATE_START #ifdef WIN32 #define _CRT_SECURE_NO_WARNINGS #pragma comment(lib, "libEasea.lib") @@ -78,7 +78,6 @@ int main(int argc, char** argv){ #include "CEvolutionaryAlgorithm.h" #include "global.h" #include "CIndividual.h" -#include "CCuda.h" #include @@ -94,6 +93,9 @@ extern CEvolutionaryAlgorithm *EA; struct gpuArg* gpuArgs; +int fstGpu = 0; +int lstGpu = 0; + struct my_struct_gpu* gpu_infos; float* fitnessTemp; @@ -111,42 +113,39 @@ PopulationImpl* Pop = NULL; \INSERT_USER_FUNCTIONS void cudaPreliminaryProcess(unsigned PopulationSize){ - int capacite_max = 0; - - //Recuperation of each device information's. - for( int index = 0; index < num_gpus; index++){ - cudaDeviceProp deviceProp; - cudaGetDeviceProperties(&deviceProp, index); - - gpu_infos[index].num_MP = deviceProp.multiProcessorCount*2; //Two block on each MP - gpu_infos[index].num_thread_max = deviceProp.maxThreadsPerBlock*0.5; //We are going to use 50% of the real maximun thread per block, we want to be sure to have enough memory for all of them. - gpu_infos[index].num_Warp = deviceProp.warpSize; - capacite_max += gpu_infos[index].num_MP * gpu_infos[index].num_thread_max; - } - - int count = 0; - - //We can have different cards that's why we are going to put more or less individuals on each of them according to their respective capacity. - for( int index = 0; index < num_gpus; index++){ - gpu_infos[index].indiv_start = count; - //On the first cards we are going to place a maximun of individuals. - if(index != (num_gpus - 1)) - gpu_infos[index].sh_pop_size = ceil((float)PopulationSize * (((float)gpu_infos[index].num_MP*(float)gpu_infos[index].num_thread_max) / (float)capacite_max) ); - //On the last card we are going to place the remaining individuals. 
- else - gpu_infos[index].sh_pop_size = PopulationSize - count; - - count += gpu_infos[index].sh_pop_size; - - /* - * The number of thread will be a multiple of the number of Warp less than or equal at the maximun number of thread per block. - * The number of block will be a multiple of the double of MP. - */ - if( !repartition(&gpu_infos[index])) - exit( -1 ); - std::cout << "Device number : " << index << " Number of block : " << gpu_infos[index].dimGrid << std::endl; - std::cout << "Device number : " << index << " Number of thread : " << gpu_infos[index].dimBlock << std::endl; - } + int noTotalMP = 0; // number of MP will be used to distribute the population + int count = 0; + + //Recuperation of each device information's. + for( int index = 0; index < num_gpus; index++){ + cudaDeviceProp deviceProp; + cudaError_t lastError = cudaGetDeviceProperties(&deviceProp, index+fstGpu); + if( lastError!=cudaSuccess ){ + std::cerr << "Cannot get device information for device no : " << index+fstGpu << std::endl; + exit(-1); + } + + gpu_infos[index].num_MP = deviceProp.multiProcessorCount; //Two block on each MP + gpu_infos[index].num_thread_max = deviceProp.maxThreadsPerBlock*0.5; //We are going to use 50% of the real maximun thread per block, we want to be sure to have enough memory for all of them. + gpu_infos[index].num_Warp = deviceProp.warpSize; + noTotalMP += gpu_infos[index].num_MP; + gpu_infos[index].gpuProp = deviceProp; + + + } + + for( int index = 0; index < num_gpus; index++){ + + gpu_infos[index].indiv_start = count; + + if(index != (num_gpus - 1)) + gpu_infos[index].sh_pop_size = ceil((float)PopulationSize * (((float)gpu_infos[index].num_MP) / (float)noTotalMP) ); + //On the last card we are going to place the remaining individuals. + else + gpu_infos[index].sh_pop_size = PopulationSize - count; + + count += gpu_infos[index].sh_pop_size; + } } __device__ __host__ inline IndividualImpl* INDIVIDUAL_ACCESS(void* buffer,unsigned id){ @@ -158,6 +157,7 @@ __device__ float cudaEvaluate(void* devBuffer, unsigned id, struct gpuOptions in } +extern "C" __global__ void cudaEvaluatePopulation(void* d_population, unsigned popSize, float* d_fitnesses, struct gpuOptions initOpts){ unsigned id = (blockDim.x*blockIdx.x)+threadIdx.x; // id of the individual computed by this thread @@ -175,9 +175,25 @@ void* gpuThreadMain(void* arg){ cudaError_t lastError; struct gpuArg* localArg = (struct gpuArg*)arg; - cudaSetDevice(localArg->threadId); + std::cout << " gpuId : " << localArg->gpuId << std::endl; + lastError = cudaSetDevice(localArg->gpuId); + if( lastError != cudaSuccess ){ + std::cerr << "Error, cannot set device properly for device no " << localArg->gpuId << std::endl; + exit(-1); + } + int nbr_cudaPreliminaryProcess = 2; + struct my_struct_gpu* localGpuInfo = gpu_infos+localArg->threadId; + + struct cudaFuncAttributes attr; + lastError = cudaFuncGetAttributes(&attr,"cudaEvaluatePopulation"); + + if( lastError != cudaSuccess ){ + std::cerr << "Error, cannot get function attribute for cudaEvaluatePopulation on card: " << localGpuInfo->gpuProp.name << std::endl; + exit(-1); + } + // Because of the context of each GPU thread, we have to put all user's CUDA // initialisation here if we want to use them in the GPU, otherwise they are // not found in the GPU context @@ -187,22 +203,61 @@ void* gpuThreadMain(void* arg){ while(1){ sem_wait(&localArg->sem_in); if( freeGPU ) { - cudaFree(localArg->d_fitness); - cudaFree(localArg->d_population); - break; + // do we need to free gpu memory + 
cudaFree(localArg->d_fitness); + cudaFree(localArg->d_population); + break; } if(nbr_cudaPreliminaryProcess > 0) { - lastError = cudaMalloc(&localArg->d_population,gpu_infos[localArg->threadId].sh_pop_size*(sizeof(IndividualImpl))); - lastError = cudaMalloc(((void**)&localArg->d_fitness),gpu_infos[localArg->threadId].sh_pop_size*sizeof(float)); - nbr_cudaPreliminaryProcess--; - } - lastError = cudaMemcpy(localArg->d_population,(IndividualImpl*)(Pop->cuda->cudaBuffer)+gpu_infos[localArg->threadId].indiv_start,(sizeof(IndividualImpl)*gpu_infos[localArg->threadId].sh_pop_size),cudaMemcpyHostToDevice); + // we should free GPU buffers when + + // here we will compute how to spread the population to evaluate on GPGPU cores + int thLimit = attr.maxThreadsPerBlock; + int N = localGpuInfo->sh_pop_size; + int w = localGpuInfo->gpuProp.warpSize; + + int b=0,t=0; + + do{ + b += localGpuInfo->num_MP; + t = ceilf( MIN(thLimit,(float)N/b)/w)*w; + } while( (b*tthLimit ); + + if( localArg->d_population!=NULL ){ cudaFree(localArg->d_population); } + if( localArg->d_fitness!=NULL ){ cudaFree(localArg->d_fitness); } + + lastError = cudaMalloc(&localArg->d_population,localGpuInfo->sh_pop_size*(sizeof(IndividualImpl))); + lastError = cudaMalloc(((void**)&localArg->d_fitness),localGpuInfo->sh_pop_size*sizeof(float)); + + std::cout << "card (" << localArg->threadId << ") " << localGpuInfo->gpuProp.name << " has " << localGpuInfo->sh_pop_size << " individual to evaluate" + << ": t=" << t << " b: " << b << std::endl; + localGpuInfo->dimGrid = b; + localGpuInfo->dimBlock = t; + + nbr_cudaPreliminaryProcess--; + + if( b*t!=N ){ + // due to lack of individuals, the population distribution is not optimial according to core organisation + // warn the user and propose a proper configuration + std::cerr << "Warning, population distribution is not optimial, consider adding " << (b*t-N) << " individuals to " + << (nbr_cudaPreliminaryProcess==2?"parent":"offspring")<<" population" << std::endl; + } + } + + lastError = cudaMemcpy(localArg->d_population,(IndividualImpl*)(Pop->cuda->cudaBuffer)+gpu_infos[localArg->threadId].indiv_start, + (sizeof(IndividualImpl)*gpu_infos[localArg->threadId].sh_pop_size),cudaMemcpyHostToDevice); - cudaEvaluatePopulation<<< gpu_infos[localArg->threadId].dimGrid, gpu_infos[localArg->threadId].dimBlock>>>(localArg->d_population,gpu_infos[localArg->threadId].sh_pop_size,localArg->d_fitness,Pop->cuda->initOpts); + // the real GPU computation (kernel launch) + cudaEvaluatePopulation<<< localGpuInfo->dimGrid, localGpuInfo->dimBlock>>>(localArg->d_population, localGpuInfo->sh_pop_size, + localArg->d_fitness,Pop->cuda->initOpts); + if( cudaGetLastError()!=cudaSuccess ){ std::cerr << "Error during synchronize" << std::endl; } + + // be sure the GPU has finished computing evaluations, and get results to CPU lastError = cudaThreadSynchronize(); - - lastError = cudaMemcpy(fitnessTemp + gpu_infos[localArg->threadId].indiv_start, localArg->d_fitness, gpu_infos[localArg->threadId].sh_pop_size*sizeof(float), cudaMemcpyDeviceToHost); + if( lastError!=cudaSuccess ){ std::cerr << "Error during synchronize" << std::endl; } + lastError = cudaMemcpy(fitnessTemp + localGpuInfo->indiv_start, localArg->d_fitness, localGpuInfo->sh_pop_size*sizeof(float), cudaMemcpyDeviceToHost); + // this thread has finished its phase, so lets tell it to the main thread sem_post(&localArg->sem_out); } sem_post(&localArg->sem_out); @@ -213,22 +268,31 @@ void* gpuThreadMain(void* arg){ void wake_up_gpu_thread(){ for( int i=0 ; iqueryGpuNum 
|| fstGpu<0 || lstGpu>queryGpuNum){ + std::cerr << "Error, not enough devices found on the system ("<< queryGpuNum <<") to satisfy user configuration ["<population = (CPopulation*)new PopulationImpl(this->params->parentPopulationSize,this->params->offspringPopulationSize, this->params->pCrossover,this->params->pMutation,this->params->pMutationPerGene,this->params->randomGenerator,this->params); + + // warning cstats parameter is null + this->population = (CPopulation*)new PopulationImpl(this->params->parentPopulationSize,this->params->offspringPopulationSize, this->params->pCrossover,this->params->pMutation,this->params->pMutationPerGene,this->params->randomGenerator,this->params,NULL); ((PopulationImpl*)this->population)->cuda = new CCuda(params->parentPopulationSize, params->offspringPopulationSize, sizeof(IndividualImpl)); Pop = ((PopulationImpl*)this->population); ; @@ -566,7 +650,7 @@ EvolutionaryAlgorithmImpl::~EvolutionaryAlgorithmImpl(){ } -PopulationImpl::PopulationImpl(unsigned parentPopulationSize, unsigned offspringPopulationSize, float pCrossover, float pMutation, float pMutationPerGene, CRandomGenerator* rg, Parameters* params) : CPopulation(parentPopulationSize, offspringPopulationSize, pCrossover, pMutation, pMutationPerGene, rg, params){ +PopulationImpl::PopulationImpl(unsigned parentPopulationSize, unsigned offspringPopulationSize, float pCrossover, float pMutation, float pMutationPerGene, CRandomGenerator* rg, Parameters* params, CStats* stats) : CPopulation(parentPopulationSize, offspringPopulationSize, pCrossover, pMutation, pMutationPerGene, rg, params, stats){ ; } @@ -584,8 +668,8 @@ PopulationImpl::~PopulationImpl(){ #include #include #include -#include #include +#include using namespace std; @@ -660,7 +744,7 @@ class PopulationImpl: public CPopulation { public: CCuda *cuda; public: - PopulationImpl(unsigned parentPopulationSize, unsigned offspringPopulationSize, float pCrossover, float pMutation, float pMutationPerGene, CRandomGenerator* rg, Parameters* params); + PopulationImpl(unsigned parentPopulationSize, unsigned offspringPopulationSize, float pCrossover, float pMutation, float pMutationPerGene, CRandomGenerator* rg, Parameters* params, CStats* stats); virtual ~PopulationImpl(); void evaluateParentPopulation(); void evaluateOffspringPopulation(); @@ -673,7 +757,7 @@ NVCC= nvcc CPPC= g++ LIBAESAE=$(EZ_PATH)libeasea/ CXXFLAGS+=-g -Wall -O2 -I$(LIBAESAE)include -I$(EZ_PATH)boost -LDFLAGS=$(EZ_PATH)boost/program_options.a $(LIBAESAE)libeasea.a -lpthread +LDFLAGS=$(EZ_PATH)boost/program_options.a $(LIBAESAE)libeasea.a -lpthread @@ -691,8 +775,8 @@ OBJ= $(EASEA_SRC:.cpp=.o) $(EASEA_MAIN_HDR:.cpp=.o) #USER MAKEFILE OPTIONS : \INSERT_MAKEFILE_OPTION#END OF USER MAKEFILE OPTIONS -CPPFLAGS+= -I$(LIBAESAE)include -I$(EZ_PATH)boost -NVCCFLAGS+= --compiler-options -fpermissive +CPPFLAGS+= -I$(LIBAESAE)include -I$(EZ_PATH)boost -I/usr/local/cuda/include/ +NVCCFLAGS+= #--ptxas-options="-v"# --gpu-architecture sm_23 --compiler-options -fpermissive BIN= EASEA -- GitLab
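Usage note (an illustrative sketch, not taken from the patch itself): the --fstgpu and --lstgpu arguments introduced above select the devices used for evaluation as the half-open range [fstgpu,lstgpu[. The binary name and the GPU count below are assumptions made only for this example; only the two flags come from this patch.

    # Hypothetical 4-GPU machine: evaluate only on devices 1 and 2,
    # i.e. the half-open range [1,3[ described in the commit message.
    ./weierstrass --fstgpu 1 --lstgpu 3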