library.py 9.63 KB
Newer Older
1
from math import floor
2
from numpy import size
Alain Shakour's avatar
Alain Shakour committed
3
def ca_set_binary_threshold_from_skew(input_dict):
    """Derive a binary decision threshold from class skew and error costs.

    Reads 'cost_false_pos', 'cost_false_neg' and 'ratio_pos_neg' from
    input_dict and returns {'bin_thres': log10(ratio * cfn / cfp)}.
    """
    import math

    ratio = float(input_dict['ratio_pos_neg'])
    cost_penalty = float(input_dict['cost_false_neg']) / float(input_dict['cost_false_pos'])
    return {'bin_thres': math.log10(ratio * cost_penalty)}

def ca_estimate_pos_neg_from_prd_fct(input_dict):
    """Estimate the positive/negative example ratio from a Prolog-style fact file.

    Counts facts of the form  target_att(...,pos_col).  and
    target_att(...,neg_col).  in the file named by input_dict['deploy_data']
    and returns {'ratio_pos_neg': n_pos / n_neg}.

    Raises ZeroDivisionError when the file contains no negative facts.
    """
    import re
    output_dict = {}
    deploy_data = input_dict['deploy_data']
    target_att = input_dict['target_att']
    pos_col = input_dict['pos_col']
    neg_col = input_dict['neg_col']

    with open(deploy_data) as f:
        deploy_file = f.read()

    # Raw strings keep the regex escapes literal (non-raw '\(' is an invalid
    # escape under Python 3).  target_att / *_col are interpolated verbatim;
    # assumed to be plain identifiers with no regex metacharacters.
    pos_arr = re.findall(target_att + r"\(.*," + pos_col + r"\)\.", deploy_file)
    print(len(pos_arr))

    neg_arr = re.findall(target_att + r"\(.*," + neg_col + r"\)\.", deploy_file)
    print(len(neg_arr))

    output_dict['ratio_pos_neg'] = len(pos_arr) / float(len(neg_arr))
    return output_dict

def ca_apply_binary_threshold(input_dict):
    """Binarise the 'predicted' scores of input_dict['score'] in place.

    Scores >= input_dict['bin_thres'] become 1, the rest 0.  The mutated
    score dict is returned under the 'classes' key.
    """
    scores = input_dict['score']
    threshold = input_dict['bin_thres']

    predicted = scores['predicted']
    for idx in range(len(predicted)):
        predicted[idx] = 1 if predicted[idx] >= threshold else 0

    return {'classes': scores}

Alain Shakour's avatar
Alain Shakour committed
48
def ca_rank_driven_binary_threshold_selection(input_dict):
    """Pick a binary threshold so that `rate` percent of scores rank above it.

    input_dict['score'] holds parallel 'actual'/'predicted' lists;
    input_dict['rate'] is a percentage in [0, 100].  Scores are sorted in
    descending order and the threshold is the midpoint between the score at
    the cut rank and the one just below it.

    Returns {'bin_thres': threshold}.  When the rate covers every score
    (rank >= n) the threshold is -inf so everything is accepted; the
    original left the key unset in that case, which made downstream
    lookups fail with KeyError.
    """
    performance = input_dict['score']
    rate = input_dict['rate']

    n = len(performance['actual'])
    paired = [(performance['actual'][i], performance['predicted'][i]) for i in range(n)]
    ranked = sorted(paired, key=lambda scr: scr[1], reverse=True)

    # floor comes from the module-level `from math import floor`.
    rank = floor(n * (float(rate) / float(100)))

    output_dict = {}
    previous = float('inf')
    for position, (_, current) in enumerate(ranked, start=1):
        if position > rank:
            output_dict['bin_thres'] = (previous + current) / float(2)
            break
        previous = current
    else:
        # Loop exhausted without crossing the cut rank: accept everything.
        output_dict['bin_thres'] = float('-inf')
    return output_dict

Alain Shakour's avatar
Alain Shakour committed
74
def ca_optimal_binary_threshold_selection(input_dict):
    """Choose the binary threshold maximising a given performance measure.

    input_dict['score'] holds parallel 'actual'/'predicted' lists and
    input_dict['method'] names the measure (see get_value).  The ROC sweep
    itself is delegated to find_best_roc_weight.

    Returns {'bin_thres': best threshold}.
    """
    performance = input_dict['score']
    method = input_dict['method']

    actual = performance['actual']
    predicted = performance['predicted']
    pairs = [(actual[k], predicted[k]) for k in range(len(actual))]
    ranked = sorted(pairs, key=lambda pair: pair[1], reverse=True)

    num_neg = sum(1 for label, _ in pairs if label == 0)
    num_pos = sum(1 for label, _ in pairs if label == 1)

    return {'bin_thres': find_best_roc_weight(method, ranked, num_pos, num_neg)}

93
def find_best_roc_weight(method, a_list, a_num_positives, a_num_negatives):
    """Sweep the ROC curve and return the score threshold maximising `method`.

    a_list: (actual, predicted) pairs sorted by predicted score, descending.
    Walking down the list moves the operating point: everything above the
    current score is predicted positive.  At each distinct score the measure
    from get_value() is evaluated and the best midpoint threshold is kept.

    Fix: the original used Python 2 print statements (a syntax error under
    Python 3); parenthesised single-argument print calls behave the same
    under both interpreters.
    """
    previous = float('inf')
    xpos = 0                # true positives at the current cut
    xneg = a_num_negatives  # true negatives at the current cut
    the_best_value = get_value(method, xpos, xneg, a_num_positives, a_num_negatives)
    best = previous
    for the_elt in a_list:
        the_roc = the_elt
        current = the_roc[1]
        if current != previous:
            possible_best_value = get_value(method, xpos, xneg, a_num_positives, a_num_negatives)
            print('%f > %f' % (possible_best_value, the_best_value))
            if possible_best_value > the_best_value:
                the_best_value = possible_best_value
                print('%f -> %f' % (best, (previous + current) / float(2)))
                best = (previous + current) / float(2)
        if the_roc[0] == 1:
            xpos += 1
        else:
            xneg -= 1
        previous = current

    # Final operating point: everything predicted positive.
    possible_best_value = get_value(method, xpos, xneg, a_num_positives, a_num_negatives)
    if possible_best_value > the_best_value:
        the_best_value = possible_best_value
        best = (previous + float('-inf')) / float(2)
    return best

121
def get_value(method, TP, TN, P, N):
    """Evaluate a confusion-matrix measure at one ROC operating point.

    TP/TN are true positive/negative counts, P/N the class totals.
    method selects: 'accuracy', 'balanced', 'recall', 'precision';
    any other value yields the F-measure.  When precision is undefined
    (TP + FP == 0) both 'precision' and the F-measure evaluate to 0.
    """
    if method == 'accuracy':
        return (TP + TN) / float(P + N)
    if method == 'balanced':
        return (TP / float(P) + TN / float(N)) / 2

    FP = N - TN
    recall = TP / float(P)
    if method == 'recall':
        return recall

    if TP + FP == 0:
        # No positive predictions: precision (and thus F) degenerate to 0.
        return 0
    precision = TP / float(TP + FP)
    if method == 'precision':
        return precision
    if precision + recall == 0:
        return 0
    return 2 * precision * recall / (precision + recall)

def Context_Bcalibration(input_dict):
    """Build a monotone (PAV-style) calibration map from training scores.

    Sorts the actual labels by predicted score (in place, via sort_list2),
    then repeatedly pools adjacent violators with search_for_position until
    the calibrated sequence is non-decreasing.

    Returns {'builded_scores': {'noncalibrated_scr': sorted predicted scores,
                                'class': sorted actual labels,
                                'calibrated_scr': calibrated values}}.

    Fixes: Python 2 print statement made py2/py3-compatible; removed a
    redundant second sort_list2 call (the lists are already sorted in place
    by the first call, so the re-sort performed no swaps).
    """
    non_calibrated_scores = input_dict['non_calibrated_scores']
    learner = input_dict['learner']  # read for interface compatibility; unused here
    old_actual = non_calibrated_scores['actual']
    old_predicted = non_calibrated_scores['predicted']

    # sort_list2 sorts both lists in place by predicted score (ascending).
    g2 = sort_list2(old_predicted, old_actual)
    Z = g2['actual']
    Z2 = g2['predicted']

    # Pool adjacent violators on a copy of the labels until monotone.
    L = search_for_position(list(Z))
    while non_decreasing(L) != True:
        L = search_for_position(L)

    output_dict = {}
    output_dict['builded_scores'] = {'noncalibrated_scr': Z2, 'class': Z, 'calibrated_scr': L}
    print(output_dict)
    return output_dict


def Context_Acalibration(input_dict):
        """Apply a previously built calibration map to a set of test scores.

        input_dict['builded_scores'] is the output of Context_Bcalibration:
        parallel lists of sorted raw scores ('noncalibrated_scr') and their
        calibrated values ('calibrated_scr').  input_dict['test_scores']
        holds parallel 'actual'/'predicted' lists to be calibrated.

        Each test score is located on the calibration map: inside a bin it
        takes the bin's calibrated value; between two bins it is linearly
        interpolated.  The calibrated value p is then converted to a
        log-odds score log10(p / (1 - p)), with p == 0 -> -inf and
        p == 1 -> +inf.

        Returns {'calibrated_scores': {'actual': ..., 'predicted': ...}}.
        """
        from numpy import inf
        
        import math
        # NOTE(review): `dict` shadows the builtin; it is the test-score dict.
        dict=input_dict['test_scores']
        sc=input_dict['builded_scores']
        X1=sc['calibrated_scr']
        #print len(X1)
        #print dict['predicted']
        
        k=0
        
        X2=sc['noncalibrated_scr']
        
        probs=[] 
        list_scr=[]
        list_max=[]
        test_cls=dict['actual']
        test_data=dict['predicted']
        i=1
        # Build the calibration bins: begin_score/end_score hold the raw-score
        # interval covered by each run of equal calibrated values in X1.
        begin_score=[]
        end_score=[]
        begin_score.append(X2[0])
        i=0
        for item in X1[1:]:
            # A change in calibrated value closes the current bin and opens a
            # new one.  (`size` is numpy's, imported at module level.)
            if (item != X1[i]) and (i<=size(X1)-1):
                    begin_score.append(X2[i+1])
                    end_score.append(X2[i])
            i+=1   
        end_score.append(X2[size(X2)-1])  
        # list_max collects one calibrated value per bin, in increasing order.
        # Assumes X1 is non-decreasing, as produced by Context_Bcalibration —
        # TODO confirm.  NOTE(review): `max` shadows the builtin here.
        max=0
        i=0
        aux =True
        #list_max.append(0)
        if 0 in X1:
            list_max.append(0)
        for j in X1:
            if (j>max):
                list_max.append(j)
                max=j
            
        
        # Map every test score onto the calibrated scale.
        for  scr in dict['predicted']:
            k+=1
            scr2=0
            for i in range(len(begin_score)):
                    # Score falls inside bin i: use the bin's calibrated value.
                    if (scr >=begin_score[i] and scr<=end_score[i])or(scr==begin_score[i])or(scr==end_score[i]):
                        
                        
                        scr2=list_max[i]
                       
                        
                    # Score falls in the gap between bin i and bin i+1:
                    # interpolate linearly between their calibrated values.
                    elif scr>end_score[i]and scr<begin_score[i+1]:
                            
                            val1=(scr-end_score[i])/(begin_score[i+1]-end_score[i])
                            
                            scr2 = list_max[i]+((list_max[i+1]-list_max[i])*val1)
                            
            list_scr.append(scr2)
        print list_scr                             
        # Convert calibrated values to log-odds; the extremes map to +/- inf
        # so downstream thresholding still orders them correctly.
        for j in list_scr:
            #print j
            if j==0:
                probs.append(float(-inf))
            elif  j==1:
                probs.append(float(+inf))
            else:
                probs.append(math.log10(j/(1-j)))  
        #print len(test_cls)
        output_dict={}  
        output_dict['calibrated_scores']= {'actual':test_cls , 'predicted':probs}
        return output_dict
244
   
245 246 247
def search_for_position(list):
    """One pool-adjacent-violators pass over `list`, modified in place.

    Scans left to right; whenever an element is smaller than its
    predecessor, the violating pair — together with any earlier elements
    equal to that predecessor — is replaced by its mean.  One pass may not
    be enough: callers loop until non_decreasing() holds.  Returns the
    mutated list.

    Fixes: Python 2 print statements made py2/py3-compatible; unused local
    removed.  NOTE(review): the parameter shadows the builtin `list`; the
    name is kept because it is part of the function's signature.
    """
    l1 = list
    l2 = l1
    new_dict = {}
    new_dict['actual'] = l2
    for i in range(1, len(l1)):
        print('etape n%s ' % i)  # debug trace kept from the original
        list2 = []   # values of the violating pool
        list3 = []   # indices of the violating pool

        aux = True
        var = True
        j = 1
        if l1[i] < l1[i-1]:
            j += 1
            print(j)
            list2.append(l1[i])
            list2.append(l1[i-1])
            list3.append(i-1)
            list3.append(i)
            # Extend the pool backwards over elements equal to the predecessor.
            for n in range(2, i):
                if l1[i-1] == l1[i-n] and var == True and i != 1:
                    j += 1
                    list2.append(l1[i-n])
                    list3.append(i-n)
                    aux = False
                else:
                    var = False
            if aux == False:
                # Pool of three or more: replace all members by their mean.
                c = sum(list2)
                for z in list3:
                    print(z)
                    l1[z] = float(c) / j
            else:
                # Simple pair: both take the midpoint.
                print(list2)
                x = ((float(l1[i] + l1[i-1])) / 2)
                l1[i] = l1[i-1] = x
                print(j)
                print(x)

    print(new_dict)
    return l1
def sort_list2(list1, list2):
    """Selection-sort list1 ascending, mirroring every swap onto list2.

    Both lists are mutated in place so that list1 is sorted and list2 keeps
    its element-wise pairing with list1.  Returns the pair as
    {'actual': list2, 'predicted': list1}.
    """
    n = len(list1)
    for i in range(n - 1):
        smallest = list1[i]
        idx = i
        for j in range(i + 1, n):
            if list1[j] < smallest:
                smallest = list1[j]
                idx = j
        if idx != i:
            list1[i], list1[idx] = list1[idx], list1[i]
            list2[i], list2[idx] = list2[idx], list2[i]
    return {'actual': list2, 'predicted': list1}
317 318 319 320


def non_decreasing(L):
    """Return True when L is sorted in non-decreasing order."""
    return not any(earlier > later for earlier, later in zip(L, L[1:]))