Commit 4792ecc9 authored by balanche's avatar balanche

Ajout de la gestion de moyennes dans le DistanceModel

parent a7fef5f8
......@@ -33,9 +33,11 @@ import jcl.utils.exceptions.MethodNotImplementedException;
* hybride (methode Samarah).
* </p>
*
* Implementation progressable avec liste de sous-processus progressables à suivre par Jean-Noël Balanche
*
* @author WEMMERT Cedric
*
* Implementation progressable avec liste de sous-processus progressable à suivre par Jean-Noël Balanche
*
*/
public abstract class Classification extends Observable implements
......
package jcl.data;
import static java.lang.Math.PI;
import static java.lang.Math.cos;
import static java.lang.Math.sin;
import java.util.ArrayList;
import java.util.List;
import java.util.Random;
import jcl.clustering.ClusteringResult;
import jcl.data.attribute.Attribute;
import jcl.data.attribute.AttributeMultiDimSequence;
import jcl.data.attribute.AttributeNumerical;
import jcl.data.attribute.AttributeSequence;
import jcl.data.distance.Distance;
import jcl.data.distance.DistanceParameter;
import jcl.data.distance.EmptyDistanceParameter;
import jcl.data.distance.MetaDistance;
import jcl.data.distance.MetaDistanceEuclidean;
import jcl.data.distance.NumericalEuclideanDistance;
import jcl.data.distance.average.Average;
import jcl.data.distance.average.AverageEuclidean;
import jcl.data.distance.average.AverageParameter;
import jcl.data.distance.average.EmptyAverageParameter;
import jcl.data.distance.average.ParameterDBAMean;
import jcl.data.distance.average.sequential.AverageMDDBAMean;
import jcl.data.distance.average.sequential.AverageMDeuclideanMean;
import jcl.data.distance.sequential.DistanceDTW;
import jcl.data.distance.sequential.DistanceDTWMD;
import jcl.data.distance.sequential.ParameterDTW;
import jcl.data.sampling.Sampler;
import jcl.learning.methods.monostrategy.kmeans.ClassifierKmeans;
import jcl.learning.methods.monostrategy.kmeans.LearningResultKmeans;
import jcl.learning.methods.monostrategy.kmeans.LightHardSeed;
import jcl.learning.methods.monostrategy.kmeans.ParametersKmeans;
/**
* This class represents a classification model.
* It contains the type of every attribute in a {@link Data} as well as the distances to be used to compare the attributes.
* It also contains a meta-distance telling how the distances at the attribute's level are composed.
* TODO It should also contain the averaging method to summarize a set of attributes.
* @author Francois Petitjean
* It also contains the averaging method used to summarize a set of attributes.
* @author Francois Petitjean / Jean-Noël Balanche
*/
public class DistanceModel {
/**
......@@ -25,16 +48,20 @@ public class DistanceModel {
*/
protected Distance<Attribute,DistanceParameter>[] distances;
protected MetaDistance metaDistance;
//TODO ajouter les moyennes au nouveau framework
/**
* Table of the averages (cell i of the table = average used for the i^th attribute)
*/
protected Average[] averages;
/**
* @deprecated rather use {@link #Model(Attribute[], Distance[], MetaDistance)}
* @deprecated rather use {@link #DistanceModel(Distance[], MetaDistance, Average[])}
*/
public DistanceModel(){}
public DistanceModel(Distance<Attribute, DistanceParameter>[] distances, MetaDistance metaDistance) {
public DistanceModel(Distance<Attribute, DistanceParameter>[] distances, MetaDistance metaDistance, Average<Attribute,AverageParameter>[] averages) {
this.distances = distances;
this.metaDistance = metaDistance;
this.averages = averages;
}
......@@ -54,13 +81,23 @@ public class DistanceModel {
/**
 * Returns the meta-distance held by this model.
 * @return the current meta-distance (the stored reference, possibly null if never set)
 */
public MetaDistance getMetaDistance() {
return metaDistance;
}
/**
 * Replaces the meta-distance held by this model.
 * @param metaDistance the meta-distance to store
 */
public void setMetaDistance(MetaDistance metaDistance) {
this.metaDistance = metaDistance;
}
/**
 * Returns the table of averages (cell i = average used for the i-th attribute).
 * @return the internal array itself, not a copy
 */
public Average[] getAverages() {
return averages;
}
/**
 * Replaces the table of averages.
 * @param averages the array to store; it is kept by reference, not copied
 */
public void setAverages(Average[] averages) {
this.averages=averages;
}
/**
* Generate the default Model from one DataObject
* @param dataObject the DataObject from which it will create the default model
......@@ -68,31 +105,147 @@ public class DistanceModel {
*/
public static DistanceModel generateDefaultModel(DataObject dataObject) {
Distance[] distances=new Distance[dataObject.getNbAttributes()];
Average[] averages = new Average[dataObject.getNbAttributes()];
for (int i=0; i<distances.length;i++) {
if (dataObject.getAttribute(i) instanceof AttributeSequence)
distances[i] = DistanceDTW.getInstance();//uses DTW distances for sequential attributes ...
else if (dataObject.getAttribute(i) instanceof AttributeMultiDimSequence)
distances[i] = DistanceDTWMD.getInstance();//uses DTWMD distances for sequential attributes ...
else
distances[i]=NumericalEuclideanDistance.getInstance();//... and euclidian distances for numerical attributes
// if (dataObject.getAttribute(i) instanceof AttributeSequence) {
// distances[i] = DistanceDTW.getInstance();//uses DTW distances for sequential attributes ...
// }
if (dataObject.getAttribute(i) instanceof AttributeMultiDimSequence) {
distances[i] = DistanceDTWMD.getInstance();// uses DTWMD distances for sequential attributes ...
averages[i] = AverageMDDBAMean.getInstance();// uses MultiDim DBA mean for sequential attributes ...
}
else {
distances[i]=NumericalEuclideanDistance.getInstance();//... and euclidean distances for numerical attributes
averages[i] = AverageEuclidean.getInstance();// ... and classic euclidean mean for numerical attributes
}
}
MetaDistance metaDistance = MetaDistanceEuclidean.getInstance(); //defines the way the two scores are combined, by default it is Euclidian
DistanceModel model = new DistanceModel(distances, metaDistance);
MetaDistance metaDistance = MetaDistanceEuclidean.getInstance(); //defines the way the two scores are combined, by default it is euclidean
DistanceModel model = new DistanceModel(distances, metaDistance, averages);
return model;
}
/**
* Generate a naive model which only uses Euclidian distance
* @param dataObject the DataObject from which it will create a model
* Generate a naive model which only uses Euclidean distances and average methods
* @param dataObject the DataObject from which it will create the model
* @return the naive model
*/
public static DistanceModel generateNaiveModel(DataObject dataObject) {
Distance[] distances=new Distance[dataObject.getNbAttributes()];
Average[] averages = new Average[dataObject.getNbAttributes()];
for (int i=0; i<distances.length;i++) {
distances[i]=NumericalEuclideanDistance.getInstance();// uses euclidian distance for every attribute
if (dataObject.getAttribute(i) instanceof AttributeMultiDimSequence)
averages[i]=AverageMDeuclideanMean.getInstance();// uses MultiDim euclidean mean for sequential attributes ...
else
averages[i] = AverageEuclidean.getInstance();// and classic euclidean mean for numerical attributes
}
MetaDistance metaDistance = MetaDistanceEuclidean.getInstance(); // defines the way the two scores are combined, by default it is Euclidian
DistanceModel model = new DistanceModel(distances, metaDistance);
DistanceModel model = new DistanceModel(distances, metaDistance, averages);
return model;
}
/**
 * Finds the medoid of a set of objects, i.e. the object whose summed
 * dissimilarity to every object of the set (itself included) is the smallest.
 * Dissimilarities are computed with the meta-distance and the distances of this model.
 * When several objects reach the same smallest sum, the first one in the array is kept.
 *
 * @param tabObj the set of objects
 * @param parameters distance parameters handed to the meta-distance
 * @return the medoid, or {@code null} when the set is empty or no sum is
 *         strictly below {@link Double#MAX_VALUE}
 */
public DataObject medoid(DataObject[] tabObj, DistanceParameter[] parameters) {
    final MetaDistance combiner = this.getMetaDistance();
    final Distance<Attribute, DistanceParameter>[] attributeDistances = this.getDistances();

    DataObject best = null;
    double bestTotal = Double.MAX_VALUE;
    for (int c = 0; c < tabObj.length; c++) {
        final DataObject candidate = tabObj[c];
        // total dissimilarity between the candidate and every object of the set
        double total = 0;
        for (int o = 0; o < tabObj.length; o++) {
            total += combiner.compute(candidate, tabObj[o], attributeDistances, parameters);
        }
        // strict comparison: an earlier candidate wins ties
        if (total < bestTotal) {
            bestTotal = total;
            best = candidate;
        }
    }
    return best;
}
/**
 * Manual demo of the averages held by a DistanceModel.
 * It builds NB_OBJECTS random objects made of one numerical attribute and one
 * one-dimensional sequence attribute, creates a model with one distance and one
 * average per attribute, then averages the sequence attribute twice: once with
 * AttributeMultiDimSequence.mean and once through the model's average.
 * Neither result is printed or compared; only "end" is written to standard output.
 *
 * @param args unused
 */
public static void main(String[] args) {
int NB_OBJECTS = 10;
int SEQUENCE_LENGTH = 5;
DataObject[] objects = new DataObject[NB_OBJECTS];
Random randGenerator = new Random();
for (int i = 0; i < NB_OBJECTS; i++) { //~ Initialization of objects (random)
Attribute[] attributes = new Attribute[2];
if (randGenerator.nextDouble() < 0.5) { // first cluster
// numerical attribute centred on +0.5, sequence following a noisy cosine
attributes[0] = new AttributeNumerical(randGenerator.nextGaussian() + .5);
double[][] sequence = new double[SEQUENCE_LENGTH][1];
for (int l = 0; l < sequence.length; l++) {
double p = l + randGenerator.nextGaussian();
sequence[l][0] = cos(p / SEQUENCE_LENGTH * 2 * PI);
}
attributes[1] = new AttributeMultiDimSequence(sequence);
} else {
// second cluster: numerical attribute centred on -0.5, sequence following a noisy sine
attributes[0] = new AttributeNumerical(randGenerator.nextGaussian() - .5);
double[][] sequence = new double[SEQUENCE_LENGTH][1];
for (int l = 0; l < sequence.length; l++) {
double p = l + randGenerator.nextGaussian();
sequence[l][0] = sin(p / SEQUENCE_LENGTH * 2 * PI);
}
attributes[1] = new AttributeMultiDimSequence(sequence);
}
objects[i]= new DataObject(attributes);
}
Distance[] distances = new Distance[2]; // a distance is set for every attribute
Average[] averages = new Average[2]; // and an average for every attribute
distances[0] = NumericalEuclideanDistance.getInstance(); // first attribute compared with an euclidean distance between numericals
averages[0] = AverageEuclidean.getInstance();
distances[1] = DistanceDTWMD.getInstance(); // second attribute (sequential) compared with the DTW distance
averages[1] = AverageMDDBAMean.getInstance();
MetaDistance metaDistance = MetaDistanceEuclidean.getInstance(); // defines the way the two scores are combined (possibility to weight)
DistanceModel model = new DistanceModel(distances, metaDistance, averages);
// Disabled: per-thread parameter tables (NB_THREADS is not declared in this method)
// DistanceParameter[][] distanceParameters = new DistanceParameter[NB_THREADS][2];
// AverageParameter[][] averageParameters = new AverageParameter[NB_THREADS][2];
// for (int th = 0; th < NB_THREADS; th++) {
// distanceParameters[th][0] = EmptyDistanceParameter.getInstance(); //no parameter for a numerical euclidean distance
// distanceParameters[th][1] = new ParameterDTW(new double[SEQUENCE_LENGTH][SEQUENCE_LENGTH]); //but yes for DTW (requires a matrix to work in)
// averageParameters[th][0] = EmptyAverageParameter.getInstance();
// averageParameters[th][1] = new ParameterDBAMean(1025);
// }
// the first object is used as the previous center for both averaging calls below
DataObject center = objects[0];
Attribute[] tabAttr = new Attribute[NB_OBJECTS];
// Disabled: medoid computation and averaging of every attribute through the model
// DataObject medoid = model.medoid(objects, distanceParameters[0]);
//
// DataObject res = new DataObject(2);
// for (int i = 0; i < 2; i++) {
// for (int j = 0; j < NB_OBJECTS; j++) {
// tabAttr[j] = objects[j].getAttribute(i);
// }
// res.setAttribute(i, model.getAverages()[i].mean(tabAttr, center.getAttribute(i), averageParameters[0][i]));
// }
// gather the sequence attribute (index 1) of every object
for (int j = 0; j < NB_OBJECTS; j++) {
tabAttr[j] = objects[j].getAttribute(1);
}
// both results are computed but never read afterwards
Attribute oldResAttrSeq = AttributeMultiDimSequence.mean(tabAttr, (AttributeMultiDimSequence)center.getAttribute(1));
Attribute newResAttrSeq = model.getAverages()[1].mean(tabAttr, center.getAttribute(1), new ParameterDBAMean(1025,15));
System.out.println("end");
}
}
......@@ -868,7 +868,7 @@ public class AttributeSequence extends Attribute {
} else {
return 2;
}
}
}
}
public static int getMode() {
......
package jcl.data.distance.average;
public interface Average {
}
import jcl.data.attribute.Attribute;
/**
 * Contract of an averaging method able to summarize a set of attributes.
 * @author Jean-Noël Balanche
 *
 * @param <A> type of attribute handled and produced by the average
 * @param <P> type of the parameter object the average works with
 */
public interface Average<A extends Attribute, P extends AverageParameter> {

    /**
     * Computes the average of a set of attributes.
     *
     * @param attributesTab the attributes to summarize
     * @param oldCenter the previous center; whether and how it is used is left to the implementation
     * @param p the parameters of the averaging method
     * @return the attribute summarizing {@code attributesTab}
     */
    A mean(Attribute[] attributesTab, A oldCenter, P p);
}
\ No newline at end of file
package jcl.data.distance.average;
import jcl.data.attribute.Attribute;
import jcl.data.attribute.AttributeNumerical;
/**
 * Averaging method for numerical attributes: the plain arithmetic mean of their values.
 * @author Jean-Noël Balanche
 *
 */
public class AverageEuclidean implements Average<AttributeNumerical,EmptyAverageParameter>{

    /**
     * The single shared instance
     */
    private static AverageEuclidean instance = new AverageEuclidean();

    /**
     * Private constructor: the instance is obtained through {@link #getInstance()}
     */
    private AverageEuclidean() {}

    /**
     * Gives access to the shared instance
     * @return the unique instance
     */
    public static AverageEuclidean getInstance() {
        return instance;
    }

    /**
     * Computes the arithmetic mean of the values of the given attributes.
     * Every element is cast to {@link AttributeNumerical}. The result is NaN for an
     * empty array, because the sum 0.0 is divided by a count of zero.
     *
     * @param attributesTab the numerical attributes to average
     * @param oldCenter not used by this method
     * @param p not used by this method
     * @return a new attribute holding the mean value
     */
    @Override
    public AttributeNumerical mean(Attribute[] attributesTab, AttributeNumerical oldCenter,
            EmptyAverageParameter p) {
        double total = 0;
        for (int k = 0; k < attributesTab.length; k++) {
            total += ((AttributeNumerical) attributesTab[k]).getValue();
        }
        // double / int is a floating-point division
        return new AttributeNumerical(total / attributesTab.length);
    }
}
package jcl.data.distance.average;
/**
 * Marker interface for the parameter objects handed to an averaging method.
 * It declares no members.
 */
public interface AverageParameter {}
package jcl.data.distance.average;
/**
 * Empty parameter used by averaging methods that need no parameter, such as euclidean means.
 * The class is a singleton: use {@link #getInstance()}.
 * @author balanche
 *
 */
public class EmptyAverageParameter implements AverageParameter {

    /**
     * The single shared instance; final so that it can never be replaced
     */
    private static final EmptyAverageParameter INSTANCE = new EmptyAverageParameter();

    /**
     * Private constructor: the instance is obtained through {@link #getInstance()}
     */
    private EmptyAverageParameter(){}

    /**
     * Gives access to the shared instance
     * @return the unique instance
     */
    public static EmptyAverageParameter getInstance(){
        return INSTANCE;
    }
}
package jcl.data.distance.average;
/**
 * Parameters used by DBA mean: the number of refinement iterations and the
 * work matrices that the averaging code fills while aligning sequences.
 * The matrices are allocated once here so they can be reused from one call to the next.
 * @author Jean-Noël Balanche
 *
 */
public class ParameterDBAMean implements AverageParameter{

    /** Direction code: no predecessor (origin cell of the matrix). */
    public final static int RIEN = -1;
    /** Direction code: the best predecessor is the cell [i-1][j-1]. */
    public final static int DIAGONALE = 0;
    /** Direction code: the best predecessor is the cell [i][j-1]. */
    public final static int GAUCHE = 1;
    /** Direction code: the best predecessor is the cell [i-1][j]. */
    public final static int HAUT = 2;

    /** Number of iterations used when none is given explicitly. */
    public final static int DEFAULT_NB_ITERATIONS = 15;

    private int nbIterations = DEFAULT_NB_ITERATIONS;

    /**
     * Matrix of cumulated alignment costs
     */
    private double[][] matriceW;

    /**
     * Matrix storing, for every cell, the best direction to come from
     */
    private int[][] matriceChoix;

    /**
     * Stores, for every cell of the matrix, the length of the optimal path associated with that cell
     */
    private int[][] optimalPathLength ;

    /**
     * Stores the cumulated deviation of the path from the diagonal. Useful for DTWDistanceToDiagonal
     */
    private double[][] distanceToDiagonalMatrix ;

    /**
     * Creates the parameters with the default number of iterations
     * ({@link #DEFAULT_NB_ITERATIONS}).
     *
     * @param maxSeqLength size of each side of the square work matrices
     */
    public ParameterDBAMean(int maxSeqLength) {
        this(maxSeqLength, DEFAULT_NB_ITERATIONS);
    }

    /**
     * Creates the parameters and allocates the four work matrices, each of size
     * {@code maxSeqLength x maxSeqLength}.
     *
     * @param maxSeqLength size of each side of the square work matrices
     * @param nbIterations number of iterations stored in these parameters
     */
    public ParameterDBAMean(int maxSeqLength, int nbIterations) {
        // Fields are assigned directly: calling the overridable setters from a
        // constructor would run subclass code on a partially built object.
        this.matriceW = new double[maxSeqLength][maxSeqLength];
        this.matriceChoix = new int[maxSeqLength][maxSeqLength];
        this.optimalPathLength = new int[maxSeqLength][maxSeqLength];
        this.distanceToDiagonalMatrix = new double[maxSeqLength][maxSeqLength];
        this.nbIterations = nbIterations;
    }

    /**
     * @return the number of iterations
     */
    public int getNbIterations() {
        return nbIterations;
    }

    /**
     * @param nbIterations the number of iterations to set
     */
    public void setNbIterations(int nbIterations) {
        this.nbIterations = nbIterations;
    }

    /**
     * @return the matriceChoix
     */
    public int[][] getMatriceChoix() {
        return matriceChoix;
    }

    /**
     * @param matriceChoix the matriceChoix to set
     */
    public void setMatriceChoix(int[][] matriceChoix) {
        this.matriceChoix = matriceChoix;
    }

    /**
     * @return the optimalPathLength
     */
    public int[][] getOptimalPathLength() {
        return optimalPathLength;
    }

    /**
     * @param optimalPathLength the optimalPathLength to set
     */
    public void setOptimalPathLength(int[][] optimalPathLength) {
        this.optimalPathLength = optimalPathLength;
    }

    /**
     * @return the distanceToDiagonalMatrix
     */
    public double[][] getDistanceToDiagonalMatrix() {
        return distanceToDiagonalMatrix;
    }

    /**
     * @param distanceToDiagonalMatrix the distanceToDiagonalMatrix to set
     */
    public void setDistanceToDiagonalMatrix(double[][] distanceToDiagonalMatrix) {
        this.distanceToDiagonalMatrix = distanceToDiagonalMatrix;
    }

    /**
     * @return the matriceW
     */
    public double[][] getMatriceW() {
        return matriceW;
    }

    /**
     * @param matriceW the matriceW to set
     */
    public void setMatriceW(double[][] matriceW) {
        this.matriceW = matriceW;
    }
}
package jcl.data.distance.average.sequential;
import static jcl.data.distance.Tools.distanceTo;
import jcl.data.attribute.Attribute;
import jcl.data.attribute.AttributeMultiDimSequence;
import jcl.data.distance.average.Average;
import jcl.data.distance.average.ParameterDBAMean;
import jcl.data.sequence.DoubleTabArrayList;
/**
* Class implementing the averaging method to use for sequential attributes.
* @author Jean-Noël Balanche
*
*/
public class AverageMDDBAMean implements Average<AttributeMultiDimSequence,ParameterDBAMean> {
/**
* Singleton: the single shared instance, created when the class is initialized
*/
private static AverageMDDBAMean instance = new AverageMDDBAMean();
/**
* Private empty constructor: the instance is obtained through getInstance()
*/
private AverageMDDBAMean(){}
/**
* To get the singleton
* @return the unique instance
*/
public static AverageMDDBAMean getInstance(){
return instance;
}
/**
 * Averages a set of multi-dimensional sequences by repeated DBA updates.
 * One update is computed from the starting center, then
 * {@code p.getNbIterations()} further updates refine the result, so
 * {@link #DBAMean} runs {@code nbIterations + 1} times in total.
 *
 * @param attributesTab the sequences to average
 * @param oldCenter the center to start from; when {@code null}, a sequence of
 *        {@code attributesTab} drawn uniformly at random is used instead
 * @param p iteration count and work matrices (the matrices are overwritten)
 * @return the refined average sequence
 */
@Override
public AttributeMultiDimSequence mean(Attribute[] attributesTab,
        AttributeMultiDimSequence oldCenter, ParameterDBAMean p) {
    AttributeMultiDimSequence start = oldCenter;
    if (start == null) {
        // Uniform draw over [0, length - 1]: Math.random() is in [0, 1), so the
        // truncated product gives every index the same probability. Rounding
        // random * (length - 1) would give the first and last index only half
        // the probability of the others.
        int alea = (int) (Math.random() * attributesTab.length);
        start = (AttributeMultiDimSequence) attributesTab[alea];
    }
    AttributeMultiDimSequence res = DBAMean(attributesTab, start, p);
    for (int i = 0; i < p.getNbIterations(); i++) {
        res = DBAMean(attributesTab, res, p);
    }
    return res;
}
/**
 * Computes the mean of a set of sequences as an update of the previous mean:
 * every sequence is aligned with the old center, and each point of the old
 * center is replaced by the barycenter of the points that were associated with it.
 *
 * @param attributesTab sequences associated with the cluster; each element is cast to
 *        {@link AttributeMultiDimSequence}, and processing stops at the first {@code null} element
 * @param oldCenter previous center of the cluster
 * @param p parameters used for the computation; its work matrices are overwritten
 * @return the new center, with as many tuples as {@code oldCenter}
 */
public AttributeMultiDimSequence DBAMean(Attribute[] attributesTab,
        AttributeMultiDimSequence oldCenter, ParameterDBAMean p) {
    final int centerLength = oldCenter.getNbTuples();

    // associated[k] gathers every tuple aligned with the k-th point of the old center
    final DoubleTabArrayList[] associated = new DoubleTabArrayList[centerLength];
    for (int k = 0; k < centerLength; k++) {
        associated[k] = new DoubleTabArrayList(attributesTab.length);
    }

    // kept outside the loops: when no direction matches below, the previous value is reused
    double predecessorCost = 0.0;

    // build the associations, one sequence at a time
    for (final Attribute raw : attributesTab) {
        final AttributeMultiDimSequence seq = (AttributeMultiDimSequence) raw;
        if (seq == null) {
            break;
        }
        final int seqLength = seq.getNbTuples();
        final double[][] cost = p.getMatriceW();
        final int[][] choice = p.getMatriceChoix();
        final int[][] pathLength = p.getOptimalPathLength();

        // origin cell, then first column and first row of the matrix
        cost[0][0] = distanceTo(oldCenter.sequence[0], seq.sequence[0]);
        choice[0][0] = ParameterDBAMean.RIEN;
        pathLength[0][0] = 0;
        for (int row = 1; row < centerLength; row++) {
            cost[row][0] = cost[row - 1][0] + distanceTo(oldCenter.sequence[row], seq.sequence[0]);
            choice[row][0] = ParameterDBAMean.HAUT;
            pathLength[row][0] = row;
        }
        for (int col = 1; col < seqLength; col++) {
            cost[0][col] = cost[0][col - 1] + distanceTo(seq.sequence[col], oldCenter.sequence[0]);
            choice[0][col] = ParameterDBAMean.GAUCHE;
            pathLength[0][col] = col;
        }

        // remaining cells: extend the cheapest of the three neighbouring paths
        for (int row = 1; row < centerLength; row++) {
            for (int col = 1; col < seqLength; col++) {
                final int direction = AttributeMultiDimSequence.ArgMin3(
                        cost[row - 1][col - 1], cost[row][col - 1], cost[row - 1][col]);
                choice[row][col] = direction;
                if (direction == ParameterDBAMean.DIAGONALE) {
                    predecessorCost = cost[row - 1][col - 1];
                    pathLength[row][col] = pathLength[row - 1][col - 1] + 1;
                } else if (direction == ParameterDBAMean.GAUCHE) {
                    predecessorCost = cost[row][col - 1];
                    pathLength[row][col] = pathLength[row][col - 1] + 1;
                } else if (direction == ParameterDBAMean.HAUT) {
                    predecessorCost = cost[row - 1][col];
                    pathLength[row][col] = pathLength[row - 1][col] + 1;
                }
                cost[row][col] = predecessorCost + distanceTo(oldCenter.sequence[row], seq.sequence[col]);
            }
        }

        // +1 because the cell holds the number of 'arrows' of the path;
        // one more is needed to get the number of elements
        final int pathElements = pathLength[centerLength - 1][seqLength - 1] + 1;

        // walk the path backwards from the last cell, recording each association
        int row = centerLength - 1;
        int col = seqLength - 1;
        for (int step = 0; step < pathElements; step++) {
            associated[row].add(seq.sequence[col]);
            switch (choice[row][col]) {
            case ParameterDBAMean.DIAGONALE:
                row--;
                col--;
                break;
            case ParameterDBAMean.GAUCHE:
                col--;
                break;
            case ParameterDBAMean.HAUT:
                row--;
                break;
            }
        }
    }

    // every point of the new center is the barycenter of its associated tuples
    final double[][] averagedTuples = new double[centerLength][oldCenter.getTupleDimension()];
    for (int k = 0; k < centerLength; k++) {
        averagedTuples[k] = moyenne(associated[k].toArray());
    }
    return new AttributeMultiDimSequence(averagedTuples);
}
/**
 * Computes the component-wise arithmetic mean of the given vectors.
 * The dimension of the result is the length of the first vector; every
 * vector is read on that many components.
 *
 * @param tab the vectors to average, at least one
 * @return a new array holding, for each component, the mean over all vectors
 * @throws IllegalArgumentException if {@code tab} is {@code null} or empty
 */
public static double[] moyenne(final double[]... tab) {
    if (tab == null || tab.length == 0) {
        // fail with an explicit message instead of an index error on tab[0]
        throw new IllegalArgumentException("moyenne requires at least one vector");
    }
    final int dimension = tab[0].length;
    // a new double[] is already filled with 0.0, no explicit reset is needed
    final double[] res = new double[dimension];
    for (final double[] vector : tab) {
        for (int j = 0; j < dimension; j++) {
            res[j] += vector[j];
        }
    }
    for (int j = 0; j < dimension; j++) {
        res[j] /= tab.length;
    }
    return res;
}
}
package jcl.data.distance.average.sequential;
import java.util.Arrays;
import jcl.data.attribute.Attribute;
import jcl.data.attribute.AttributeMultiDimSequence;
import jcl.data.distance.average.Average;
import jcl.data.distance.average.EmptyAverageParameter;
/**
* Class implementing the euclidean mean for sequential attributes
* @author Jean-Noël Balanche
*
*/
public class AverageMDeuclideanMean implements Average<AttributeMultiDimSequence,EmptyAverageParameter> {
/**
* Singleton
*/
private static AverageMDeuclideanMean instance = new AverageMDeuclideanMean();
/**
* Empty constructor
*/
private AverageMDeuclideanMean(){}
/**
* To get the singleton
* @return the unique instance
*/
public static AverageMDeuclideanMean getInstance(){
return instance;
}
@Override
public AttributeMultiDimSequence mean(Attribute[] attributesTab,
AttributeMultiDimSequence oldCenter, EmptyAverageParameter p) {