Commit 201de056
Authored Jan 04, 2010 by maitre
Parent: 9189089b

    Multi-gpu version of drone model regression
Showing 2 changed files with 566 additions and 85 deletions (+566 -85):

    dev/tgp_regression/tgp_regression.ez        +249  -83
    dev/tgp_regression/tgp_regressionEval.cu    +317  -2
dev/tgp_regression/tgp_regression.ez @ 201de056

    This diff is collapsed.
dev/tgp_regression/tgp_regressionEval.cu @ 201de056
@@ -7,6 +7,34 @@
 #define BIG_NUMBER 1.0E15f

+struct gpuArg {
+  int threadId;
+  sem_t sem_in;
+  sem_t sem_out;
+  float *progs_k;
+  float *results_k;
+  int *indexes_k;
+  int *hits_k;
+  float *inputs_k;
+  float *outputs_k;
+  int index_st;
+  int index_end;
+  int indiv_st;
+  int indiv_end;
+};
+
+struct gpuArg *gpuArgs;
+bool freeGPU = false;
+int sh_pop_size = 0;
+int sh_length = 0;
+
 __global__ static void
 EvaluatePostFixIndividuals_128(const float *k_progs, const int maxprogssize,
@@ -30,7 +58,7 @@ EvaluatePostFixIndividuals_128(const float * k_progs,
   float sum = 0.0;
   int hits = 0; // hits number
-  float currentX, currentOutput;
+  float currentW, currentX, currentY, currentZ, currentOutput;
   float result;
   int start_prog;
   int codop;
@@ -61,7 +89,11 @@ EvaluatePostFixIndividuals_128(const float * k_progs,
     if (i * NUMTHREAD2 + tid >= trainingSetSize) // no!
       continue;
-    currentX = k_inputs[i * NUMTHREAD2 + tid];
+    currentW = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 0];
+    currentX = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 1];
+    currentY = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 2];
+    currentZ = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 3];
     currentOutput = k_outputs[i * NUMTHREAD2 + tid];
     start_prog = k_indexes[index]; // index of first codop
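The new indexing reads four interleaved variables per training case. With the case number c = i * NUMTHREAD2 + tid, the expression (i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + k reduces to c * VAR_LEN + k, i.e. the host is expected to pack the input array case-major, with VAR_LEN consecutive floats per case. A minimal host-side packing sketch consistent with that layout (the arrays w, x, y, z and the name pack_inputs are hypothetical; VAR_LEN is assumed to be 4, matching the four reads in the kernel):

    /* Hypothetical packing: inputs_f[c * VAR_LEN + 0..3] holds
       {w, x, y, z} of training case c, matching the kernel's reads. */
    void pack_inputs(float *inputs_f, const float *w, const float *x,
                     const float *y, const float *z, int trainingSetSize)
    {
        for (int c = 0; c < trainingSetSize; c++) {
            inputs_f[c * 4 + 0] = w[c];
            inputs_f[c * 4 + 1] = x[c];
            inputs_f[c * 4 + 2] = y[c];
            inputs_f[c * 4 + 3] = z[c];
        }
    }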
@@ -73,8 +105,17 @@ EvaluatePostFixIndividuals_128(const float * k_progs,
       switch (codop) {
+      case OP_W:
+        stack[sp++] = currentW;
+        break;
       case OP_X:
         stack[sp++] = currentX;
         break;
+      case OP_Y:
+        stack[sp++] = currentY;
+        break;
+      case OP_Z:
+        stack[sp++] = currentZ;
+        break;
       case OP_ERC:
         tmp = k_progs[start_prog++];
         stack[sp++] = tmp;
@@ -150,3 +191,277 @@ EvaluatePostFixIndividuals_128(const float * k_progs,
   }
   // here results and hits have been stored in their respective array: we can leave
 }

The remainder of the hunk is newly added code:
__global__ static void
EvaluatePostFixIndividuals_128_mgpu(const float *k_progs,
                                    const int maxprogssize,
                                    const int popsize,
                                    const float *k_inputs,
                                    const float *k_outputs,
                                    const int trainingSetSize,
                                    float *k_results,
                                    int *k_hits,
                                    int *k_indexes,
                                    int start_index,
                                    int gpu_id)
{
  __shared__ float tmpresult[NUMTHREAD2];
  __shared__ float tmphits[NUMTHREAD2];

  const int tid = threadIdx.x; // 0 to NUM_THREADS-1
  const int bid = blockIdx.x;  // 0 to NUM_BLOCKS-1

  int index;    // index of the prog processed by the block
  float sum = 0.0;
  int hits = 0; // hits number
  float currentW, currentX, currentY, currentZ, currentOutput;
  float result;
  int start_prog;
  int codop;
  float stack[MAX_STACK];
  int sp;
  float op1, op2;
  float tmp;

  index = bid; // one program per block => block ID = program number

  if (index >= popsize)       // idle block (should never occur)
    return;
  if (k_progs[index] == -1.0) // already evaluated
    return;

  // Here, it's a busy thread
  sum = 0.0;
  hits = 0; // hits number

  // Loop on training cases, per cluster of 32 cases (= number of thread)
  // (even if there are only 8 stream processors, we must spawn at least 32 threads)
  // We loop from 0 to upper bound INCLUDED in case trainingSetSize is not
  // a multiple of NUMTHREAD
  for (int i = 0; i < ((trainingSetSize - 1) >> LOGNUMTHREAD2) + 1; i++) {
    // are we on a busy thread?
    if (i * NUMTHREAD2 + tid >= trainingSetSize) // no!
      continue;

    currentW = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 0];
    currentX = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 1];
    currentY = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 2];
    currentZ = k_inputs[(i * NUMTHREAD2 * VAR_LEN) + tid * VAR_LEN + 3];
    currentOutput = k_outputs[i * NUMTHREAD2 + tid];

    start_prog = k_indexes[index] - start_index; // index of first codop
    codop = k_progs[start_prog++];
    sp = 0; // stack and stack pointer

    while (codop != OP_RETURN) {
      switch (codop) {
      case OP_W:
        stack[sp++] = currentW;
        break;
      case OP_X:
        stack[sp++] = currentX;
        break;
      case OP_Y:
        stack[sp++] = currentY;
        break;
      case OP_Z:
        stack[sp++] = currentZ;
        break;
      case OP_ERC:
        tmp = k_progs[start_prog++];
        stack[sp++] = tmp;
        break;
      case OP_MUL:
        sp--;
        op1 = stack[sp];
        sp--;
        op2 = stack[sp];
        stack[sp] = __fmul_rz(op1, op2);
        stack[sp] = op1 * op2; // note: overwrites the round-toward-zero product above
        sp++;
        break;
      case OP_ADD:
        sp--;
        op1 = stack[sp];
        sp--;
        op2 = stack[sp];
        stack[sp] = __fadd_rz(op1, op2);
        stack[sp] = op1 + op2; // note: overwrites the round-toward-zero sum above
        sp++;
        break;
      case OP_SUB:
        sp--;
        op1 = stack[sp];
        sp--;
        op2 = stack[sp];
        stack[sp] = op2 - op1;
        sp++;
        break;
      case OP_DIV:
        sp--;
        op2 = stack[sp];
        sp--;
        op1 = stack[sp];
        if (op2 == 0.0) // protected division
          stack[sp] = 1.0;
        else
          stack[sp] = op1 / op2;
        sp++;
        break;
      }
      // get next codop
      codop = k_progs[start_prog++];
    } // codop interpret loop

    result = fabsf(stack[0] - currentOutput);
    if (!(result < BIG_NUMBER))
      result = BIG_NUMBER;
    else if (result < PROBABLY_ZERO)
      result = 0.0;
    if (result <= HIT_LEVEL)
      hits++;

    sum += result; // sum raw error on all training cases
  } // LOOP ON TRAINING CASES

  // gather results from all threads => we need to synchronize
  tmpresult[tid] = sum;
  tmphits[tid] = hits;
  __syncthreads();

  if (tid == 0) {
    for (int i = 1; i < NUMTHREAD2; i++) {
      tmpresult[0] += tmpresult[i];
      tmphits[0] += tmphits[i];
    }
    k_results[index] = tmpresult[0];
    k_hits[index] = tmphits[0];
    //printf("g %d %d %f\n",gpu_id,bid,k_results[index]);
    //fflush(stdout);
  }
  // here results and hits have been stored in their respective array: we can leave
}
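The one functional difference from the single-GPU kernel is the rebasing of program offsets: each worker copies only its slice of the flattened program buffer to its device, so k_indexes[index] still holds a global offset while k_progs is local. Subtracting start_index (the global offset where this GPU's slice begins) converts one into the other. For example, if GPU 1's slice starts at global offset 5000 and k_indexes[index] is 5120, the program begins at local offset 120 in that GPU's progs_k.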
void wake_up_gpu_thread(int nbGpu) {
  // note: loops over the global nbGPU rather than the nbGpu parameter
  for (int i = 0; i < nbGPU; i++) {
    DEBUG_PRT("wake up th %d", i);
    //fflush(stdout);
    sem_post(&(gpuArgs[i].sem_in));
  }
  for (int i = 0; i < nbGPU; i++) {
    sem_wait(&gpuArgs[i].sem_out);
  }
}
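This post-all/wait-all pair acts as a barrier: the master releases every worker's sem_in, then blocks until every worker has signalled sem_out. The worker threads and semaphores themselves are created elsewhere; a minimal initialization sketch consistent with this handshake (the function name init_gpu_threads and the pthread_t array are hypothetical; gpuArgs and gpuThreadMain come from this file) might look like:

    #include <pthread.h>
    #include <semaphore.h>
    #include <stdlib.h>

    /* Hypothetical setup consistent with wake_up_gpu_thread/gpuThreadMain:
       one worker thread per GPU, both semaphores initially blocked. */
    void init_gpu_threads(int nbGpu)
    {
        gpuArgs = (struct gpuArg *)malloc(nbGpu * sizeof(struct gpuArg));
        pthread_t *threads = (pthread_t *)malloc(nbGpu * sizeof(pthread_t));
        for (int i = 0; i < nbGpu; i++) {
            gpuArgs[i].threadId = i;
            sem_init(&gpuArgs[i].sem_in, 0, 0);  /* worker blocks until notified */
            sem_init(&gpuArgs[i].sem_out, 0, 0); /* master blocks until worker is done */
            pthread_create(&threads[i], NULL, gpuThreadMain, &gpuArgs[i]);
        }
    }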
void notify_gpus(float *progs, int *indexes, int length,
                 CIndividual **population, int popSize, int nbGpu) {
  int pop_chunk_len = popSize / nbGpu;
  //cout << " population chunk length : " << pop_chunk_len << "/" << length << endl;
  assert(nbGpu == 2);

  int len1 = indexes[pop_chunk_len];
  int len2 = length - indexes[pop_chunk_len];
  fprintf(gpu_stat_file, "%d,%d,%f,", len1, len2, (double)len1 / len2);

  sh_pop_size = pop_chunk_len;
  sh_length = length;

  wake_up_gpu_thread(nbGpu);
}
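For the asserted two-GPU case, the work is split by individuals: with popSize = 100, pop_chunk_len = 50, so GPU 0 evaluates individuals 0..49 and GPU 1 evaluates 50..99. Since indexes[50] marks where individual 50's program starts in the flattened buffer, len1 and len2 are the two GPUs' shares of that buffer, and the ratio written to gpu_stat_file measures the load balance between them.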
/**
   Send input and output data to the GPU memory.
   Allocate the device-side buffers.
*/
void initialDataToMGPU(float *input_f, int length_input,
                       float *output_f, int length_output, int gpu_id) {
  // allocate and copy input/output arrays
  CUDA_SAFE_CALL(cudaMalloc((void **)(&(gpuArgs[gpu_id].inputs_k)),
                            sizeof(float) * length_input));
  CUDA_SAFE_CALL(cudaMalloc((void **)(&(gpuArgs[gpu_id].outputs_k)),
                            sizeof(float) * length_output));
  CUDA_SAFE_CALL(cudaMemcpy((gpuArgs[gpu_id].inputs_k), input_f,
                            sizeof(float) * length_input, cudaMemcpyHostToDevice));
  CUDA_SAFE_CALL(cudaMemcpy((gpuArgs[gpu_id].outputs_k), output_f,
                            sizeof(float) * length_output, cudaMemcpyHostToDevice));

  // allocate indexes and programs arrays
  int maxPopSize = MAX(EA->population->parentPopulationSize,
                       EA->population->offspringPopulationSize);
  CUDA_SAFE_CALL(cudaMalloc((void **)&(gpuArgs[gpu_id].indexes_k),
                            sizeof(*indexes_k) * maxPopSize));
  CUDA_SAFE_CALL(cudaMalloc((void **)&(gpuArgs[gpu_id].progs_k),
                            sizeof(*progs_k) * MAX_PROGS_SIZE));

  // allocate hits and results arrays
  // (element size taken from *indexes_k; sizeof(int) == sizeof(float) here)
  CUDA_SAFE_CALL(cudaMalloc((void **)&(gpuArgs[gpu_id].results_k),
                            sizeof(*indexes_k) * maxPopSize));
  CUDA_SAFE_CALL(cudaMalloc((void **)&(gpuArgs[gpu_id].hits_k),
                            sizeof(*indexes_k) * maxPopSize));
}
void *gpuThreadMain(void *arg) {
  struct gpuArg *localArg = (struct gpuArg *)arg;
  DEBUG_PRT("gpu th %d", localArg->threadId);
  CUDA_SAFE_CALL(cudaSetDevice(localArg->threadId));

  // Alloc memory for this thread
  initialDataToMGPU(inputs_f, fitnessCasesSetLength * VAR_LEN,
                    outputs_f, fitnessCasesSetLength * NB_TREES,
                    localArg->threadId);
  DEBUG_PRT("allocation ok for th %d", localArg->threadId);
  //sem_post(&localArg->sem_out);

  // Wait for population to evaluate.
  while (1) {
    sem_wait(&localArg->sem_in);

    int indiv_st = localArg->threadId * sh_pop_size;
    int indiv_end = indiv_st + sh_pop_size;
    int index_st = indexes[indiv_st];
    int index_end = 0;
    if (localArg->threadId != nbGPU - 1)
      index_end = indexes[indiv_end];
    else
      index_end = sh_length;

    if (freeGPU)
      break;

    /* cout << "gpu " << localArg->threadId << " has been notified" << endl; */
    /* cout << indiv_st << "|" << indiv_end << "|" << index_st << "|" << index_end << endl; */
    /* fflush(stdout); */

    //here we copy assigned population chunk to the related GPU
    CUDA_SAFE_CALL(cudaMemcpy(localArg->indexes_k, indexes + indiv_st,
                              (indiv_end - indiv_st) * sizeof(int),
                              cudaMemcpyHostToDevice));
    // progs holds floats; sizeof(int) == sizeof(float) here
    CUDA_SAFE_CALL(cudaMemcpy(localArg->progs_k, progs + index_st,
                              (index_end - index_st) * sizeof(int),
                              cudaMemcpyHostToDevice));

    EvaluatePostFixIndividuals_128_mgpu<<<sh_pop_size, 128>>>(
        localArg->progs_k, index_end - index_st, sh_pop_size,
        localArg->inputs_k, localArg->outputs_k, NB_FITNESS_CASES,
        localArg->results_k, localArg->hits_k, localArg->indexes_k,
        index_st, localArg->threadId);

    /* cudaThreadSynchronize(); */
    // copy back this GPU's slice of the fitness values
    // (note: hits_k is not copied back here)
    CUDA_SAFE_CALL(cudaMemcpy(results + (localArg->threadId * sh_pop_size),
                              localArg->results_k,
                              (indiv_end - indiv_st) * sizeof(int),
                              cudaMemcpyDeviceToHost));
    sem_post(&localArg->sem_out);
  }

  DEBUG_PRT("gpu : %d", localArg->threadId);
  DEBUG_PRT("addr k_prog : %p", localArg->progs_k);
  CUDA_SAFE_CALL(cudaFree(localArg->progs_k));
  CUDA_SAFE_CALL(cudaFree(localArg->results_k));
  CUDA_SAFE_CALL(cudaFree(localArg->hits_k));
  CUDA_SAFE_CALL(cudaFree(localArg->inputs_k));
  CUDA_SAFE_CALL(cudaFree(localArg->outputs_k));
  sem_post(&localArg->sem_out);

  cout << "gpu " << localArg->threadId << " has been freed" << endl;
  fflush(stdout);
  return NULL;
}
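Shutdown works through the freeGPU flag: a worker that wakes up and sees it set breaks out of the loop, frees its device buffers, and posts sem_out one last time, which lets the post-all/wait-all barrier in wake_up_gpu_thread complete. A minimal teardown sketch consistent with that protocol (the name free_gpu_threads is hypothetical):

    /* Hypothetical shutdown: set the flag, then run the barrier once so
       every worker wakes, frees its device memory, and signals back. */
    void free_gpu_threads(int nbGpu)
    {
        freeGPU = true;
        wake_up_gpu_thread(nbGpu);
    }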