GPU.tpl 20.8 KB
Newer Older
moh_lo's avatar
moh_lo committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
\TEMPLATE_START// -*- mode: c++; c-indent-level: 2; c++-member-init-indent: 8; comment-column: 35; -*-
//
// (The above line is useful in Emacs-like editors)
//
//****************************************
//                                         
//  EASEA.cpp
//                                         
//  C++ file generated by AESAE-CUDA v0.9a
//                                         
//****************************************


\ANALYSE_PARAMETERS
#include <unistd.h>
16
#include "tool/outputea.h"
moh_lo's avatar
moh_lo committed
17
#include <stdio.h>
18
#include "tool/tool.h"
moh_lo's avatar
moh_lo committed
19
20
21
22
23
24

#define TAILLE_POP \POP_SIZE
#define TAILLE_POP_ENFANTS \OFF_SIZE
#define PRESSION_SELECTION 2
#define PRESSION_REPLACEMENT 2
#define NB_GENERATION \NB_GEN
25
26
27
28
#define GPGPU
#include "EASEAIndividual.h"  //Generated individual
#include "EASEAGPUEval.h"     //Generated header for gpu evaluation
#include "EASEAUserFunc.h"
moh_lo's avatar
moh_lo committed
29
30
31
32
33
34
35
36
37

//INSERT_INITIALISATION_FUNCTION 
\INSERT_INITIALISATION_FUNCTION 



//static inline size_t tournoi(CIndividu** population, size_t popSize, size_t pression){
// Tournament selection: draws `pression` random candidates from `population`
// and returns the index of the one with the highest nFitness.
//
// population : array of at least `popSize` valid individuals
// popSize    : number of candidates to draw from (callers must pass > 0)
// pression   : number of draws (tournament size); 0 yields index 0 without
//              drawing, as before
//
// The champion is seeded from the first draw rather than from a fitness of 0.
// With the former seed, a tournament whose candidates all had a fitness <= 0
// always returned index 0, which biased selection and made "pick a second,
// different parent" loops spin forever.
//
// NOTE(review): assumes getRandomIntMax(n) returns an index in [0, n) -- confirm
// against tool/tool.h.
size_t tournoi(CIndividu** population, size_t popSize, size_t pression){
  if( pression == 0 ) return 0;

  // first draw: current champion
  size_t meilleurIndividu = getRandomIntMax(popSize);
  float meilleurFitness = population[meilleurIndividu]->nFitness;

  // remaining draws: a challenger wins only if strictly better
  for( size_t i = 1 ; i<pression ; i++ ){
    size_t index = getRandomIntMax(popSize);
    if(population[index]->nFitness > meilleurFitness){
      meilleurFitness = population[index]->nFitness;
      meilleurIndividu = index;
    }
  }
  return meilleurIndividu;
}



// Picks one parent by tournament (size PRESSION_SELECTION) among the first
// `popSize` entries of `population`. The population itself is left untouched.
CIndividu* selection(CIndividu** population, size_t popSize){
  return population[ tournoi(population,popSize,PRESSION_SELECTION) ];
}


// Picks one survivor by tournament (size PRESSION_REPLACEMENT) and removes it
// from `population`: the vacated slot is filled with the last entry
// (index popSize-1). The caller is responsible for shrinking its own size
// counter by one afterwards.
CIndividu* remplacement(CIndividu** population, size_t popSize){
  const size_t winner = tournoi(population,popSize,PRESSION_REPLACEMENT);
  CIndividu* const survivor = population[winner];

  // compact the array: the last individual takes the winner's place
  population[winner] = population[popSize-1];

  return survivor;
}


67
// Address of the index-th genome slot inside a flat byte buffer of genomes.
// Both arguments are parenthesized so that expressions such as
// GPU_BUFFER_POS(base + off, i + 1) expand with the intended precedence.
#define GPU_BUFFER_POS(buffer,index) ((buffer)+sizeof(genome_t)*(index))
moh_lo's avatar
moh_lo committed
68
69
70
71
72
73
74


#ifdef GPGPU
// Evaluates `popSize` individuals through launch_krnl and stores each
// resulting fitness in the matching individual.
//
// parentGenomes : flat buffer of genomes handed to launch_krnl
// pPopParents   : individuals, in the same order as their genomes in the buffer
// popSize       : number of individuals / genomes
//
// launch_krnl returns an array obtained with new[]; it is released here.
void gpuEvaluation(char* parentGenomes,CIndividu** pPopParents,size_t popSize){
  OutputEa oeaNull;

  // run the evaluation for the whole buffer in one call
  float* const fitnesses = launch_krnl(popSize,parentGenomes);

  // report every fitness and copy it into its individual
  for( size_t k=0 ; k<popSize ; k++ ){
    CIndividu* const individual = pPopParents[k];
    const float fitness = fitnesses[k];

    printf("%p %f\n",individual,fitness);
    individual->setFitness(fitness);
  }

  delete[] fitnesses;
}
#endif


int main(){
	
	
  /// declarations
	
  //FILE *fpFichierSauvegarde;
  CIndividu *tPop1[TAILLE_POP+TAILLE_POP_ENFANTS]; 
  CIndividu *tPop2[TAILLE_POP+TAILLE_POP_ENFANTS];
  CIndividu **pTemp;
	
  CIndividu **pPopCourante=(CIndividu **) tPop1;
  CIndividu **pNouvellePop=(CIndividu **) tPop2;
	
  CIndividu **pPopParents=pPopCourante; // Pointeur sur la population de parents
  CIndividu **pPopEnfants=&(pPopCourante[TAILLE_POP]); // Pointeur sur
101
102
  char* parentGenomes = (char*)malloc(sizeof(genome_t)*TAILLE_POP);
  char* offspringGenomes = (char*)malloc(sizeof(genome_t)*TAILLE_POP_ENFANTS);
moh_lo's avatar
moh_lo committed
103
104
105
106
107
108
109
110
111
112
113
114
115
  // la population
  // d'enfants
  float fPMut=0.05f;
  // l'ide'e est de  faire une mutation
  // par enfant cre'e' en moyenne, et donc
  // une probabilite' de mutation de
  // 1/nb_de_gènes
  float fPCross=1;  // Probabilite' d'appel du crossover
  int bElitisme=1;  // Pour commencer, on va utiliser de l'e'litisme
  int i,nNbEnfants=0,nTailleNouvellePop=0,nTaillePopCourante=0;
  //  int fIntensity =39 ; // pour l'instant c'est l'init defaut
  size_t generationCourante = 0;
	
116
  showInfo();
moh_lo's avatar
moh_lo committed
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
	
  printf( "\n----> Cre'ation de la population : \n");
	
  // Cre'ation et initialisation de la population
  for (i=0;i<TAILLE_POP;i++) {
    pPopParents[i]=new CIndividu(GPU_BUFFER_POS(parentGenomes,i));
    printf("\npPopParents[%d]=%p\n",i,pPopParents[i]); 

  }

  
  for( i=TAILLE_POP; i<TAILLE_POP_ENFANTS ; i++)
    pPopEnfants[i] = NULL;
  
  for( i=0 ; i<TAILLE_POP+TAILLE_POP_ENFANTS; i++)
    pNouvellePop[i] = NULL;

  printf("\n----> Evaluation de la population \n");
	
  // Evaluation pour cre'er une population de parents

#ifdef GPGPU
  gpuEvaluation(parentGenomes,pPopParents,TAILLE_POP);
140
  showPopulationBooleanArray(parentGenomes,SIZE,TAILLE_POP);
moh_lo's avatar
moh_lo committed
141
142
143
144
145
146
#else
  for (i=0;i<TAILLE_POP;i++)
    pPopParents[i]->evaluation();
 
#endif
  printf("\n\n");
147
  //exit(-1);
moh_lo's avatar
moh_lo committed
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

  for (i=0;i<TAILLE_POP;i++)
    printf("%s\n",pPopParents[i]->toString().c_str());


  // Boucle e'volutionnaire
  while( generationCourante < NB_GENERATION ){
    printf("La PopParents contient :\n");
    for(i=0;i<TAILLE_POP;i++)
      printf("%p, ",pPopParents[i]);

    printf("\n");
    printf("\n");
    printf("-----------------------------------------------------------------");
    printf("\n");
		
    while (nNbEnfants<TAILLE_POP_ENFANTS){   // boucle sur la taille des enfants
			
      if (randomLoc(0,1)<fPCross) { // ope'rateur binaire (croisement) 
	printf("\nCroisement :\n");
				
	CIndividu *i1,*i2;
	i1=selection(pPopParents,TAILLE_POP);
				
	do{
	  i2=selection(pPopParents,TAILLE_POP);    // 
	} while (i2 ==i1); // Moui, c'est pas oblige' mais pourquoi pas...
	//rajouter un and afin de  stopper en cas de convergence
				
	if (randomLoc(0,1)<0.5) // micro-subtilite'... ;-)
	  pPopEnfants[nNbEnfants]=i1->croisement(i2,GPU_BUFFER_POS(offspringGenomes,nNbEnfants));  
	else  
	  pPopEnfants[nNbEnfants]=i2->croisement(i1,GPU_BUFFER_POS(offspringGenomes,nNbEnfants));
				
	printf("%p + %p = %p\n",i1,i2,pPopEnfants[nNbEnfants]);
183
	
moh_lo's avatar
moh_lo committed
184
185
186
187
188
189
190
191
      }
      else { // ope'rateur unaire (clonage)
	CIndividu *i1;
				
	i1=selection(pPopParents,TAILLE_POP);  //Il faut e'crire la se'lection
	pPopEnfants[nNbEnfants]=new CIndividu(i1,GPU_BUFFER_POS(offspringGenomes,nNbEnfants));
	printf("Clonage: %p clone' en %p\n",i1,pPopEnfants[nNbEnfants]);
      }
192
      
moh_lo's avatar
moh_lo committed
193
      //printf("\nMutation de %p\n",pPopEnfants[nNbEnfants]);
194
      
moh_lo's avatar
moh_lo committed
195
      pPopEnfants[nNbEnfants]->mutation(fPMut,GPU_BUFFER_POS(offspringGenomes,nNbEnfants)); // mutation.
196
197
      memcpy(GPU_BUFFER_POS(offspringGenomes,nNbEnfants),&(pPopEnfants[nNbEnfants]->genome),sizeof(genome_t));

moh_lo's avatar
moh_lo committed
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
      nNbEnfants++;
    }
    // Bon, et bien on est maintenant au complet (parents + enfants)
    // Il faut maintenant e'valuer tous les enfants
		

    printf("\n");
    printf("-----------------------------------------------------------------");
    printf("\n");


#ifdef GPGPU
    gpuEvaluation(offspringGenomes,pPopEnfants,TAILLE_POP_ENFANTS);
#else
    for (i=0;i<TAILLE_POP_ENFANTS;i++)		
      pPopEnfants[i]->evaluation();
#endif
215
216
217
218
219


    for (i=0;i<TAILLE_POP;i++)
      printf("%s\n",pPopEnfants[i]->toString().c_str());
      
moh_lo's avatar
moh_lo committed
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
		
    printf("\n\n");
    // Il faut maintenant remplir la nouvelle population et on raisonne
    // maintenant en population globale parents+enfants
		
    nTaillePopCourante=TAILLE_POP+TAILLE_POP_ENFANTS;
    nTailleNouvellePop=0;
		
    printf("La PopCourante contient :\n");
    for(i=0;i<nTaillePopCourante;i++)
      printf("%p (%d), ",pPopParents[i],pPopParents[i]->nFitness);
    printf("\n");
		
    // Si on de'cide qu'il y a de l'e'litisme, on commence par un e'litisme mou
		
    if (bElitisme){
			
      int iMeilleur=0;
      for (i=1;i<nTaillePopCourante;i++)
	if (pPopCourante[i]->nFitness>pPopCourante[iMeilleur]->nFitness)
	  iMeilleur=i;
				
      pNouvellePop[0]=pPopCourante[iMeilleur];
				
      printf("----> Elitisme : on recopie %p dans la population d'enfants\n",pNouvellePop[0]);
				
      // Maintenant, on supprime de la population courante l'individu
      // qui a e'te' tranfe're' dans la nouvelle population
      pPopCourante[iMeilleur]=pPopCourante[nTaillePopCourante-1];
      nTaillePopCourante--; nTailleNouvellePop++;
    }
		
    printf("----> Remplacement\n");
    printf("----> Re'capitulatif avant Remplacement:\npPopCourante contient les individus :\n");
    for( i=0;i<nTaillePopCourante;i++)
      printf("%p, ",pPopCourante[i]);
		
    printf("\npNouvellePop contient les individus (dont les %d premiers sont bons) :\n",nTailleNouvellePop);
    for( i=0;i<TAILLE_POP+TAILLE_POP_ENFANTS;i++)
      if (pNouvellePop[i]==NULL) printf("%p, ",pNouvellePop[i]);
			
    while(nTailleNouvellePop<TAILLE_POP){
      pNouvellePop[nTailleNouvellePop]=remplacement(pPopCourante,nTaillePopCourante);
				
      printf("\n\n%p est e'lu\n",pNouvellePop[nTailleNouvellePop]);
      nTaillePopCourante--; 
      nTailleNouvellePop++;
     
    }			

    printf("\n\n----> Re'capitulatif avant mise à jour :\npPopCourante contient les individus :\n");
    for(i=0;i<TAILLE_POP+TAILLE_POP_ENFANTS;i++)
      printf("%p, ",pPopCourante[i]);
			
    printf("\npNouvellePop contient les individus (dont les %d premiers sont bons) :\n",nTailleNouvellePop);
    for( i=0;i<TAILLE_POP+TAILLE_POP_ENFANTS;i++)
      if (pNouvellePop[i]==NULL) printf("%p, ",pNouvellePop[i]);
				
				
    //delete the contant of the current population, i.e. indiviudals that have not been selected by replacement
    for (int j=0;j<nTaillePopCourante;j++) {
      delete pPopCourante[j];
      pPopCourante[j] = NULL;
    }
				
    // Et maintenant, suprême astuce, on e'change les populations !
    // Go to the next population, swap populations
    pTemp=pPopCourante; pPopCourante=pPopParents=pNouvellePop;pNouvellePop=pTemp;
    pPopEnfants=&(pPopCourante[TAILLE_POP]);
				
    printf("PopCourante contient les individus (dont les %d premiers sont bons) :\n",nTailleNouvellePop);
    for( i=0;i<TAILLE_POP+TAILLE_POP_ENFANTS;i++)
      printf("%p, ",pPopCourante[i]);
				
    printf("\npNouvellePop contient ... ce qu'elle contient:\n");
    for( i=0;i<TAILLE_POP+TAILLE_POP_ENFANTS;i++)
      if (pNouvellePop[i]){
	printf("%p, ",pNouvellePop[i]);
      }
					
    printf("\n\n ON EST REPARTIS POUR UN TOUR !!! \n\n");
					
    // et on remet les compteurs à ze'ro
    nNbEnfants=0;
    generationCourante++;
  }


  for(size_t i = 0 ; i<TAILLE_POP ; i++){
    printf("%s\n", pPopCourante[i]->toString().c_str());
  }
  
  // free all internal structures
  for(size_t i = 0 ; i<TAILLE_POP ; i++) delete pPopCourante[i];
  
  free(offspringGenomes);
  free(parentGenomes);
    
  return 0;
}



\ANALYSE_PARAMETERS


326
327
328
329
330
331
332
333
334
335
336
337
\START_USER_FUN_H_TPL
#ifndef USER_FUNC
#define USER_FUNC
//INSERT_USER_FUNCTIONS
\INSERT_USER_FUNCTIONS
//INSERT_USER_DECLARATIONS
\INSERT_USER_DECLARATIONS
#endif



\START_GPU_INDIVIDUAL_H_TPL// -*- mode: c++; c-indent-level: 2; c++-member-init-indent: 8; comment-column: 35; -*-
moh_lo's avatar
moh_lo committed
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
//
// (The above line is useful in Emacs-like editors)
//
//****************************************
//                                         
//  EASEAGenome.h
//                                         
//  C++ file generated by AESAE-EO v0.7b
//                                         
//****************************************
//

\ANALYSE_USER_CLASSES
#ifndef CINDIVIDU_HPP
#define CINDIVIDU_HPP

#define LG_GENOME 16
#include <stdlib.h>
//#include <iostream>
#include <sstream>
358
359
360
361
#include "tool/tool.h"
#include "EASEAUserFunc.h"


moh_lo's avatar
moh_lo committed
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386




// Plain container for the genome of one individual.
// The member declarations and the body of copy() are not written here: they
// come from the template markers below (INSERT_GENOME and COPY_CTOR), which
// are presumably replaced by the generator with the user's genome fields and
// the field-by-field copy code.
class genome_t {
public:
  //INSERT_GENOME
  \INSERT_GENOME

  // Default constructor: performs no initialisation of its own.
  genome_t(){;}
  // Copies the content of `genome` into this object; the actual statements are
  // supplied by the COPY_CTOR marker and refer to the parameter by this name.
  void copy(const genome_t& genome){
    \COPY_CTOR
  }
};


// One individual of the population: a genome plus its fitness.
// The constructors and the genetic operators take a `gpuBuffer` argument, the
// slot of a flat genome buffer that mirrors this individual's genome.
class CIndividu{
public:

  // ---- construction / destruction ----
  CIndividu(char* gpuBuffer);                     // new individual
  CIndividu(const CIndividu *, char* gpuBuffer);  // copy of an existing individual
  ~CIndividu();

  // ---- genetic operators ----
  float evaluation();
  CIndividu* croisement(const CIndividu* i1,char* gpuBuffer)const;
  bool mutation(float, char* gpuBuffer);

  // ---- utilities ----
  std::string toString();
  void setFitness(float f){ nFitness = f; }
  static size_t genomeSize(){ return LG_GENOME*sizeof(char); }

  // ---- data ----
  float nFitness;   // fitness of this individual
  genome_t genome;  // genome of this individual
};


void showPopulationBooleanArray(char* population,size_t genSize, size_t popSize);
void showFitnessArray(float* fitnesses, size_t popSize);

399
400
401
#endif
\START_GPU_INDIVIDUAL_CPP_TPL// -*- mode: c++; c-indent-level: 2; c++-member-init-indent: 8; comment-column: 35; -*-
#include "EASEAIndividual.h"
moh_lo's avatar
moh_lo committed
402
403
404
405
406

// Builds a new individual and mirrors its genome into `gpuBuffer`.
//
// gpuBuffer : destination slot of at least sizeof(genome_t) bytes, or NULL to
//             skip the mirroring (same convention as the copying constructor).
//
// The construction and initialisation statements are supplied by the
// GENOME_CTOR and INSERT_INITIALISER template markers below.
CIndividu::CIndividu(char* gpuBuffer){
  //GENOME_CTOR
  \GENOME_CTOR
  \INSERT_INITIALISER
    ;

  //copy the current genome in the gpuBuffer (when one is provided)
  if(gpuBuffer) memcpy(gpuBuffer,&(this->genome),sizeof(genome_t));
}


// Builds an individual whose genome is a copy of the genome of `ind`.
// When `gpuBuffer` is not NULL the copied genome is also written into it
// (sizeof(genome_t) bytes). The fitness is not copied.
CIndividu::CIndividu(const CIndividu* ind, char* gpuBuffer){
  //INSERT_INITIALISER
  genome.copy(ind->genome);

  if( gpuBuffer != NULL ){
    memcpy(gpuBuffer,&genome,sizeof(genome_t));
  }
}

// Destructor. Its statements are supplied by the GENOME_DTOR template marker
// below (presumably the release of whatever the genome owns -- confirm against
// the generator).
CIndividu::~CIndividu(){
  //GENOME_DTOR
  \GENOME_DTOR
}

// Host-side evaluation of this individual; returns a float.
// The body is supplied by the INSERT_EVALUATOR template marker below.
// NOTE(review): nothing visible here stores the result in nFitness -- confirm
// whether the inserted code does it or whether callers must.
float CIndividu::evaluation(){
  //INSERT_EVALUATOR
  \INSERT_EVALUATOR
}

// Mutation operator.
// fPMut     : mutation probability passed by the caller
// gpuBuffer : genome buffer slot associated with this individual
// The body (and the meaning of the returned bool) is supplied by the
// INSERT_MUTATOR template marker below.
bool CIndividu::mutation(float fPMut, char* gpuBuffer){
  //INSERT_MUTATOR
  \INSERT_MUTATOR
}

// Crossover between this individual and `i1`.
//
// Two working copies are created (child1 from this, child2 from i1); the
// crossover statements supplied by the INSERT_CROSSOVER template marker
// operate on them. child1 is the result: its genome is written into
// `gpuBuffer` (sizeof(genome_t) bytes) and it is returned to the caller, who
// owns it. child2 is a temporary and is destroyed here.
//
// child2 was created with new, so it is released with delete (the previous
// free() skipped the destructor and mismatched the allocator).
CIndividu* CIndividu::croisement(const CIndividu* i1,char* gpuBuffer) const {
  CIndividu* child1 = new CIndividu(this,NULL);
  CIndividu* child2 = new CIndividu(i1,NULL);
  //INSERT_CROSSOVER
  \INSERT_CROSSOVER
    ;
  memcpy(gpuBuffer,&(child1->genome),sizeof(genome_t));
  delete child2;
  return child1;
}


// Returns a textual description of this individual: whatever the
// INSERT_DISPLAY template marker writes, followed by the fitness and the
// address of the object.
std::string CIndividu::toString(){
  // Local string stream deliberately named `cout`: it shadows std::cout inside
  // this function, presumably so that the inserted display code, written
  // against `cout`, ends up in the returned string -- confirm with the generator.
  std::ostringstream cout;

  //INSERT_DISPLAY
  \INSERT_DISPLAY
  
  cout << " fitness : " << nFitness;
  cout << " addr : " << this;
  return cout.str();
}


460
461
462
463
464
465
466
467
468
469
// Prints a flat genome buffer as a table: one line per individual, one
// " <value> |" cell per byte, each byte shown as a decimal integer.
//
// population : buffer of popSize*genSize bytes
// genSize    : number of bytes printed per individual
// popSize    : number of individuals (lines)
void showPopulationBooleanArray(char* population,size_t genSize, size_t popSize){
  for( size_t row=0 ; row<popSize ; row++ ){
    const char* genes = population + row*genSize;

    for( size_t col=0 ; col<genSize ; col++ ){
      printf( " %d |", genes[col]);
    }
    printf( "\n");
  }
}
moh_lo's avatar
moh_lo committed
470
471


472
473
// Prints one "Fitness of <index> is <value>" line per entry of `fitnesses`.
//
// fitnesses : array of at least popSize floats
// popSize   : number of entries to print
//
// The index is a size_t and is printed with the matching %zu conversion
// (the previous %d was undefined behaviour where size_t is wider than int).
void showFitnessArray(float* fitnesses, size_t popSize){
  size_t i;

  for(  i=0; i<popSize ; i++ ){
    printf("Fitness of %zu is %f\n", i, fitnesses[i]);
  }
}
moh_lo's avatar
moh_lo committed
479
480
481
482




483
484
485
486
\START_GPU_EVAL_H_TPL
//START_GPU_EVAL_H_TPL
#ifndef GPU_EVA_H_TPL
#define GPU_EVA_H_TPL
moh_lo's avatar
moh_lo committed
487

488
489
490
float* 
launch_krnl(size_t popSize, BOOLEAN_EA* pop);
void showInfo(void);
moh_lo's avatar
moh_lo committed
491
492
493

#endif

494
495
496
497
498
499
500
501
502
503
504
\START_GPU_EVAL_CU_TPL
//START_GPU_EVAL_CU_TPL
#define DEBUG_KRNL
#include "tool/basetype.h"
#include "tool/tool.h"
#include "tool/timing.h"
#include "tool/debug.h"
#include "EASEAUserFunc.h"
#include "EASEAIndividual.h"
#include <iostream>
#include <assert.h>
moh_lo's avatar
moh_lo committed
505
506


507
using namespace std;
moh_lo's avatar
moh_lo committed
508
509
510



511
512
#define MAX_THREAD_NUM 512
#define NB_MP 8
moh_lo's avatar
moh_lo committed
513
514
515



516
517
#define NB_MP 8
#define MAX_BLOCK_SIZE 512
moh_lo's avatar
moh_lo committed
518
519


520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
// Splits a population of `popSize` individuals into `nbBlock` full blocks of
// `nbThreadPB` threads plus one last block of `nbThreadLB` threads, so that
//   nbBlock*nbThreadPB + nbThreadLB == popSize
// with both thread counts <= maxBlockSize.
//
// popSize      : number of individuals to distribute
// nbBlock      : [out] number of full blocks
// nbThreadPB   : [out] threads per full block
// nbThreadLB   : [out] threads of the last (remainder) block, possibly 0
// nbMP         : number of multiprocessors to spread the blocks over
// maxBlockSize : maximum number of threads per block
//
// Returns true when the outputs describe a valid distribution. Returns false
// when they do not, or when no launchable configuration exists (popSize,
// nbMP or maxBlockSize equal to 0); the outputs are then not meaningful.
//
// Changes compared with the previous version:
//   - the multiprocessor count is taken from nbMP everywhere (a literal 8 was
//     used in the large-population branch);
//   - the remainder is always computed; it used to be skipped depending on an
//     unrelated modulo test, which rejected perfectly valid sizes;
//   - degenerate inputs no longer divide by zero, and the block-growing loop
//     stops once blocks would become empty.
//
// NOTE(review): partieEntiereSup is assumed to be a ceiling function -- confirm
// against tool/tool.h.
bool
repartition(size_t popSize, size_t* nbBlock, size_t* nbThreadPB, size_t* nbThreadLB, 
	    size_t nbMP, size_t maxBlockSize){
  
  (*nbBlock) = 0;
  (*nbThreadPB) = 0;
  (*nbThreadLB) = 0;

  // nothing can be launched for these inputs
  if( popSize == 0 || nbMP == 0 || maxBlockSize == 0 )
    return false;
  
  if( ((float)popSize / (float)nbMP) <= maxBlockSize ){
    // the population spread over the MPs fits in one block per MP
    (*nbThreadPB) = partieEntiereSup( (float)popSize/(float)nbMP);
    if( (*nbThreadPB) == 0 )
      return false;
    (*nbBlock) = popSize/(*nbThreadPB);
  }
  else{
    // the population is too large to be spread over the MPs directly:
    // use a multiple of nbMP blocks
    (*nbBlock) = partieEntiereSup( (float)popSize/((float)maxBlockSize*(float)nbMP));
    (*nbBlock) *= nbMP;
    if( (*nbBlock) == 0 )
      return false;
    (*nbThreadPB) = popSize/(*nbBlock);
  }

  // whatever does not fit in the full blocks goes to the last block
  (*nbThreadLB) = popSize - (*nbThreadPB)*(*nbBlock);

  // The remainder may be too large for a single block (only possible when
  // the number of blocks exceeds maxBlockSize): add blocks, nbMP at a time,
  // until it fits.
  while( (*nbThreadLB) > maxBlockSize ){
    (*nbBlock) += nbMP;
    (*nbThreadPB) = popSize/(*nbBlock);
    if( (*nbThreadPB) == 0 )
      return false;   // more blocks than individuals: give up
    (*nbThreadLB) = popSize - (*nbThreadPB)*(*nbBlock);
  }
  
  if((((*nbBlock)*(*nbThreadPB) + (*nbThreadLB))  == popSize) 
     && ((*nbThreadLB) <= maxBlockSize) && ((*nbThreadPB) <= maxBlockSize))
    return true;
  else 
    return false;
}

moh_lo's avatar
moh_lo committed
562
563


564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597

// Prints the number of CUDA devices and, for each of them, the main fields of
// its cudaDeviceProp structure.
//
// The size_t fields (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; they used to be
// printed with %d, which is undefined behaviour and truncates the values on
// 64-bit hosts.
//
// CDC is the project's CUDA-call checking macro (presumably from
// tool/debug.h); its usage is unchanged.
__host__ void showInfo(){
  int devCount,i;
  cudaError_t lastError;
  struct cudaDeviceProp cdp;
  
  CDC(lastError,"cudaGetDeviceCount",cudaGetDeviceCount(&devCount));
  
  printf("Number of device : %d\n",devCount);
  for( i=0 ; i<devCount ; i++ ){
    CDC(lastError,"cudaGetDeviceProperties",cudaGetDeviceProperties(&cdp,i));
    printf("Name : %s\n",cdp.name);
    printf("TotalGlobalMem %zu\n",cdp.totalGlobalMem);
    printf("SharedMemPerBlock %zu\n",cdp.sharedMemPerBlock);
    printf("regsPerBlock %d\n",cdp.regsPerBlock);
    printf("warpSize %d\n",cdp.warpSize);
    printf("memPitch %zu\n",cdp.memPitch);
    printf("maxThreadsPerBlock %d\n",cdp.maxThreadsPerBlock);
    
    printf("maxThreadsDim.x %d\n",cdp.maxThreadsDim[0]);
    printf("maxThreadsDim.y %d\n",cdp.maxThreadsDim[1]);
    printf("maxThreadsDim.z %d\n",cdp.maxThreadsDim[2]);
    
    printf("maxGridSize.x %d\n",cdp.maxGridSize[0]);
    printf("maxGridSize.y %d\n",cdp.maxGridSize[1]);
    printf("maxGridSize.z %d\n",cdp.maxGridSize[2]);
    
    printf("totalConstMem %zu\n",cdp.totalConstMem);
    printf("major %d\n",cdp.major);
    printf("minor %d\n",cdp.minor);
    printf("clockRate %d\n",cdp.clockRate);
    printf("textureAlignment %zu\n",cdp.textureAlignment);
    printf("deviceOverlap %d\n",cdp.deviceOverlap);
    printf("multiProcessorCount %d\n",cdp.multiProcessorCount);
  }
}
moh_lo's avatar
moh_lo committed
600
601


602
// Evaluates one genome given as raw storage and returns its fitness.
// Callable from host and device code.
//
// rawGenome : start of the storage of one genome_t
//
// The evaluation statements are supplied by the INSERT_EVALUATOR template
// marker; they access the genome through the local pointer named `genome`.
__host__ __device__ FITNESS_TYPE gpuEvaluate(BOOLEAN_EA* rawGenome){
  // typed view over the raw storage
  genome_t* genome = reinterpret_cast<genome_t*>(rawGenome);

  //INSERT_EVALUATOR
  \INSERT_EVALUATOR
}
moh_lo's avatar
moh_lo committed
610
611


612
// Kernel: one thread evaluates one individual after staging its genome in
// dynamic shared memory.
//
// Launch layout: 1D grid of 1D blocks. The last block of the grid is the
// remainder block: only its first `nbThreadLB` threads do any work.
// Dynamic shared memory: blockDim.x * sizeof(genome_t) elements of BOOLEAN_EA.
//
// d_population : device buffer of genomes, sizeof(genome_t) elements each
// d_fitnesses  : device array receiving one fitness per thread index
// nbThreadLB   : number of active threads in the last block
//
// Each thread only touches its own slice of shared memory, so no barrier is
// used.
__global__ void cudaEvaluatePopulationSM(BOOLEAN_EA* d_population, float* d_fitnesses, size_t nbThreadLB){
  extern __shared__ BOOLEAN_EA s_data[];

  const size_t genomeLength = sizeof(genome_t);
  const size_t threadGlobal = blockDim.x*blockIdx.x+threadIdx.x;

  // threads beyond the remainder in the last block have nothing to do
  const bool lastBlock = ( blockIdx.x == gridDim.x-1 );
  if( lastBlock && threadIdx.x >= nbThreadLB ) return;

  // stage this thread's genome in its private slice of shared memory
  BOOLEAN_EA* staged = s_data + threadIdx.x*genomeLength;
  const size_t source = threadGlobal*genomeLength;
  for( size_t k=0 ; k<genomeLength ; k++ ){
    staged[k] = d_population[source+k];
  }

  d_fitnesses[threadGlobal] = gpuEvaluate(staged);
}
moh_lo's avatar
moh_lo committed
632
633
634
635




636
// Kernel: one thread computes the fitness of one individual as the number of
// non-zero elements among the sizeof(genome_t) elements of its genome.
//
// Launch layout: 1D grid of 1D blocks. The last block of the grid is the
// remainder block: only its first `nbThreadLB` threads do any work.
// No shared memory is used.
//
// d_population : device buffer of genomes, sizeof(genome_t) elements each
// d_fitnesses  : device array receiving one fitness per thread index
// nbThreadLB   : number of active threads in the last block
//
// The count is accumulated in a local variable and stored once; the previous
// version performed a read-modify-write on global memory for every element.
// The stored value is the same.
__global__ void cudaEvaluatePopulation(BOOLEAN_EA* d_population, float* d_fitnesses, size_t nbThreadLB){
  size_t id = blockDim.x*blockIdx.x+threadIdx.x;
  size_t individual = id*sizeof(genome_t);
  size_t i=0;

  // last block is the block which computes the remainder
  if( blockIdx.x == gridDim.x-1){
    if( threadIdx.x >= nbThreadLB ) return;
  }

  // real computation
  float count = 0.0f;
  for( i=0 ; i<sizeof(genome_t) ; i++ )
    if(d_population[individual+i])
      count += 1.0f;

  d_fitnesses[id] = count;
}
moh_lo's avatar
moh_lo committed
654
655
656



657
658
659
660
661
// Evaluates a whole population on the device.
//
// popSize : number of individuals
// pop     : host buffer holding popSize genomes of sizeof(genome_t) bytes each
//
// Returns a host array of popSize fitness values allocated with new[]; the
// caller must release it with delete[].
//
// Steps: compute the block/thread distribution, copy the genomes to the
// device, run cudaEvaluatePopulationSM, copy the fitness values back and free
// the device buffers.
//
// The result of repartition() is now checked: when no valid distribution
// exists the program stops with a message instead of launching a kernel that
// would leave some fitness values uncomputed.
float* 
launch_krnl(size_t popSize, BOOLEAN_EA* pop){
  BOOLEAN_EA* d_population;
  float* fitnessTab, * d_fitnessTab;
  cudaError_t lastError = cudaSuccess;

  size_t nbBlock,nbThreadPB,nbThreadLB;
  size_t memSize = sizeof(genome_t) * popSize;

  //compute the repartition over MP and SP
  if( !repartition(popSize,&nbBlock,&nbThreadPB,&nbThreadLB,NB_MP,MAX_THREAD_NUM) ){
    cerr << "launch_krnl : no valid repartition for a population of " << popSize
         << " individuals" << endl;
    exit(EXIT_FAILURE);
  }

  fitnessTab = new float[popSize];

  CDC(lastError,"CudaMalloc d_population",cudaMalloc( (void**) &d_population, memSize));
  CDC(lastError,"CudaMalloc d_fitnessTab",cudaMalloc( (void**) &d_fitnessTab, popSize*sizeof(float)));

  CDC(lastError,"CudaMemCpy #1",cudaMemcpy( d_population, pop, memSize, cudaMemcpyHostToDevice));
  //cudaThreadSynchronize();

  dim3 dimBlock(nbThreadPB);

  // The kernel treats the last block of the grid as the remainder block, in
  // which only nbThreadLB threads work. The grid therefore always holds the
  // nbBlock full blocks plus that extra block, even when nbThreadLB is 0
  // (the extra block is then idle).
  dim3 dimGrid(nbBlock+1);

  // one genome per thread in dynamic shared memory
  size_t sharedMemSize = nbThreadPB*sizeof(genome_t)*sizeof(BOOLEAN_EA);

  cout << "repartition -> nbBlock : " << nbBlock << " nbThreadPB : " << nbThreadPB
       << " nbThreadLB : " << nbThreadLB << endl;

  cout << "Shared memory usage : " << sharedMemSize << endl;

  CDC(
      lastError,
      "Launch kernel",
      (cudaEvaluatePopulationSM<<< dimGrid, dimBlock , sharedMemSize >>>(d_population,d_fitnessTab,nbThreadLB)));
  cudaThreadSynchronize();
  
  CDC(lastError,"CudaMemCpy #2",cudaMemcpy( fitnessTab , d_fitnessTab , popSize*sizeof(float), cudaMemcpyDeviceToHost));

  CDC(lastError,"CudaFree d_population",cudaFree( (void*) d_population));
  CDC(lastError,"CudaFree d_fitnessTab",cudaFree( (void*) d_fitnessTab));

  return fitnessTab;
}

709
710
711
712
713
714
715
716
717
//START_EO_INITER_TPL
//START_EO_MUT_TPL
//START_EO_QUAD_XOVER_TPL
//START_EO_CONTINUE_TPL
//START_EO_PARAM_TPL
\START_EO_MAKEFILE_TPL
# Build rules for the generated program (EASEA.out).
# nvcc compiles the .cu file and links; g++ compiles the C/C++ sources.
NVCC = nvcc
CPPC = g++
CC = g++

NVCCFLAGS = -g -O2 -I/usr/include/libxml2/
CPPFLAGS = $(NVCCFLAGS)
CFLAGS = $(NVCCFLAGS)

LDFLAGS = -lxml2

# every object is rebuilt when any header of this directory changes
HDR= $(wildcard *.h)

# these targets are commands, not files
.PHONY: all clean

all:EASEA.out

EASEA.out: tool/tool.o EASEAIndividual.o EASEA.o EASEAGPUEval.o
	$(NVCC) -o $@ $^ $(LDFLAGS) $(NVCCFLAGS)

tool/%.o:tool/%.c tool/%.h
	$(CC) -c -o $@ $< $(CFLAGS)

%.o:%.cpp $(HDR)
	$(CPPC) -c -o $@ $< $(CPPFLAGS)

%.o:%.cu $(HDR)
	$(NVCC) -c -o $@ $< $(NVCCFLAGS) --device-emulation

# -f: do not fail when there is nothing to remove
clean:
	rm -f *.o EASEA.out
 
moh_lo's avatar
moh_lo committed
745
\TEMPLATE_END