Commit 9b785640 authored by romanorac

support for new version of discomll

parent 18152626
......@@ -17,4 +17,3 @@ liac-arff==2.0.1
networkx==1.9.1
djangorestframework==3.0.3
django-filter==0.9.1
discomll
def file_url(input_dict):
from discomll import dataset
......@@ -9,18 +8,17 @@ def file_url(input_dict):
for url in urls:
if url[0].split("://")[0] == "https":
raise Exception("Dataset should be accessible over HTTP.")
del(input_dict["url"])
del (input_dict["url"])
X_indices_splited = input_dict["X_indices"].replace(" ","").split("-")
X_indices_splited = input_dict["X_indices"].replace(" ", "").split("-")
if len(X_indices_splited) == 2:
a, b = X_indices_splited
if not a.isdigit() or not b.isdigit():
raise Exception("Feature indices should be integers. Example: 1-10")
X_indices = range(int(a), int(b))
else:
X_indices = [int(v) for v in input_dict["X_indices"].replace(" ","").split(",") if v != ""]
del(input_dict["X_indices"])
X_indices = [int(v) for v in input_dict["X_indices"].replace(" ", "").split(",") if v != ""]
del (input_dict["X_indices"])
input_dict["data_type"] = "gzip" if input_dict["data_type"] == "true" else ""
......@@ -31,18 +29,18 @@ def file_url(input_dict):
else:
X_meta = input_dict["custom"]
data = dataset.Data(data_tag = urls,
X_indices = X_indices,
X_meta = X_meta,
generate_urls = True if input_dict["range"] == "true" else False,
data = dataset.Data(data_tag=urls,
X_indices=X_indices,
X_meta=X_meta,
generate_urls=True if input_dict["range"] == "true" else False,
**input_dict)
print data.params
return {"dataset" : data}
return {"dataset": data}
def big_data_apply_classifier(input_dict):
def big_data_apply_classifier(input_dict):
if "naivebayes_fitmodel" in input_dict["fitmodel_url"]:
return naivebayes_predict(input_dict)
elif "logreg_fitmodel" in input_dict["fitmodel_url"]:
......@@ -60,206 +58,224 @@ def big_data_apply_classifier(input_dict):
elif "linreg_fitmodel" in input_dict["fitmodel_url"]:
return linreg_predict(input_dict)
def lwlr_fit_predict(input_dict):
    """Run discomll locally weighted linear regression (fit and predict in one call).

    Reads ``fitting_dataset``, ``training_dataset`` and ``tau`` from
    ``input_dict`` and forwards them to
    ``locally_weighted_linear_regression.fit_predict`` with
    ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by that call.
    """
    from discomll.regression import locally_weighted_linear_regression

    predictions_url = locally_weighted_linear_regression.fit_predict(
        fitting_data=input_dict["fitting_dataset"],
        training_data=input_dict["training_dataset"],
        tau=input_dict["tau"],
        save_results=True)
    return {"string": predictions_url}
def dt_fit(input_dict):
    """Fit a forest of distributed decision trees with discomll.

    Widget values are read from ``input_dict`` and passed to
    ``forest_distributed_decision_trees.fit``. ``seed`` is the literal string
    ``"None"`` or an integer in string form; ``bootstrap`` and
    ``separate_max`` are the strings ``"true"``/other and are converted to
    booleans here.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.ensemble import forest_distributed_decision_trees

    # The widget delivers the seed as text; "None" means "no fixed seed".
    random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])

    fitmodel_url = forest_distributed_decision_trees.fit(
        input_dict["dataset"],
        trees_per_chunk=input_dict["trees_per_subset"],
        max_tree_nodes=input_dict["tree_nodes"],
        min_samples_leaf=input_dict["min_samples_leaf"],
        min_samples_split=input_dict["min_samples_split"],
        class_majority=input_dict["majority"],
        bootstrap=input_dict["bootstrap"] == "true",
        measure=input_dict["measure"],
        accuracy=input_dict["accuracy"],
        separate_max=input_dict["separate_max"] == "true",
        random_state=random_state,
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def dt_predict(input_dict):
    """Predict with a previously fitted forest of distributed decision trees.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``forest_distributed_decision_trees.predict`` with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.ensemble import forest_distributed_decision_trees

    predictions_url = forest_distributed_decision_trees.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def rf_fit(input_dict):
    """Fit a distributed random forest with discomll.

    Widget values are read from ``input_dict`` and passed to
    ``distributed_random_forest.fit``. ``seed`` is the literal string
    ``"None"`` or an integer in string form; ``separate_max`` is compared
    with the string ``"true"`` to obtain a boolean.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.ensemble import distributed_random_forest

    # The widget delivers the seed as text; "None" means "no fixed seed".
    random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])

    fitmodel_url = distributed_random_forest.fit(
        input_dict["dataset"],
        trees_per_chunk=input_dict["trees_per_subset"],
        max_tree_nodes=input_dict["tree_nodes"],
        min_samples_leaf=input_dict["min_samples_leaf"],
        min_samples_split=input_dict["min_samples_split"],
        class_majority=input_dict["majority"],
        measure=input_dict["measure"],
        accuracy=input_dict["accuracy"],
        separate_max=input_dict["separate_max"] == "true",
        random_state=random_state,
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def rf_predict(input_dict):
    """Predict with a previously fitted distributed random forest.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``distributed_random_forest.predict`` with ``save_results=True``.

    ``input_dict["seed"]`` is deliberately not read: ``predict`` is no longer
    given a ``random_state``, so requiring the key would only make the call
    fail with ``KeyError`` when the widget does not supply it.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.ensemble import distributed_random_forest

    predictions_url = distributed_random_forest.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def wrf_fit(input_dict):
    """Fit a distributed weighted forest with discomll.

    Widget values are read from ``input_dict`` and passed to
    ``distributed_weighted_forest_rand.fit``. ``seed`` is the literal string
    ``"None"`` or an integer in string form; ``separate_max`` is compared
    with the string ``"true"`` to obtain a boolean.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.ensemble import distributed_weighted_forest_rand

    # The widget delivers the seed as text; "None" means "no fixed seed".
    random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])

    fitmodel_url = distributed_weighted_forest_rand.fit(
        input_dict["dataset"],
        trees_per_chunk=input_dict["trees_per_subset"],
        max_tree_nodes=input_dict["tree_nodes"],
        num_medoids=input_dict["num_medoids"],
        min_samples_leaf=input_dict["min_samples_leaf"],
        min_samples_split=input_dict["min_samples_split"],
        class_majority=input_dict["majority"],
        measure=input_dict["measure"],
        accuracy=input_dict["accuracy"],
        separate_max=input_dict["separate_max"] == "true",
        random_state=random_state,
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def wrf_predict(input_dict):
    """Predict with a previously fitted distributed weighted forest.

    Passes ``input_dict["dataset"]``, ``input_dict["fitmodel_url"]`` and
    ``input_dict["coeff"]`` to ``distributed_weighted_forest_rand.predict``
    with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.ensemble import distributed_weighted_forest_rand

    predictions_url = distributed_weighted_forest_rand.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        coeff=input_dict["coeff"],
        save_results=True)
    return {"string": predictions_url}
def linsvm_fit(input_dict):
    """Fit a discomll linear SVM.

    Passes ``input_dict["dataset"]`` and ``input_dict["nu"]`` to
    ``linear_svm.fit`` with ``save_results=True``.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.classification import linear_svm

    fitmodel_url = linear_svm.fit(
        input_dict["dataset"],
        nu=input_dict["nu"],
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def linsvm_predict(input_dict):
    """Predict with a previously fitted discomll linear SVM.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``linear_svm.predict`` with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.classification import linear_svm

    predictions_url = linear_svm.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def linreg_fit(input_dict):
    """Fit a discomll linear regression model.

    Passes ``input_dict["dataset"]`` to ``linear_regression.fit`` with
    ``save_results=True``.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.regression import linear_regression

    fitmodel_url = linear_regression.fit(input_dict["dataset"],
                                         save_results=True)
    return {"fitmodel_url": fitmodel_url}
def linreg_predict(input_dict):
    """Predict with a previously fitted discomll linear regression model.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``linear_regression.predict`` with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.regression import linear_regression

    predictions_url = linear_regression.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def kmeans_fit(input_dict):
    """Fit a discomll k-means clustering model.

    Reads ``dataset``, ``clusters``, ``itr`` and ``seed`` from ``input_dict``
    and passes them to ``kmeans.fit`` with ``save_results=True``. ``seed`` is
    the literal string ``"None"`` or an integer in string form.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.clustering import kmeans

    # The widget delivers the seed as text; "None" means "no fixed seed".
    random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])

    fitmodel_url = kmeans.fit(
        input_dict["dataset"],
        n_clusters=input_dict["clusters"],
        max_iterations=input_dict["itr"],
        random_state=random_state,
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def kmeans_predict(input_dict):
    """Predict with a previously fitted discomll k-means model.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``kmeans.predict`` with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.clustering import kmeans

    predictions_url = kmeans.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def logreg_fit(input_dict):
    """Fit a discomll logistic regression model.

    Passes ``input_dict["dataset"]``, ``input_dict["alpha"]`` and
    ``input_dict["itr"]`` (as ``max_iterations``) to
    ``logistic_regression.fit`` with ``save_results=True``.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.classification import logistic_regression

    fitmodel_url = logistic_regression.fit(
        input_dict["dataset"],
        alpha=input_dict["alpha"],
        max_iterations=input_dict["itr"],
        save_results=True)
    return {"fitmodel_url": fitmodel_url}
def logreg_predict(input_dict):
    """Predict with a previously fitted discomll logistic regression model.

    Passes ``input_dict["dataset"]`` and ``input_dict["fitmodel_url"]`` to
    ``logistic_regression.predict`` with ``save_results=True``.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.classification import logistic_regression

    predictions_url = logistic_regression.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        save_results=True)
    return {"string": predictions_url}
def naivebayes_fit(input_dict):
    """Fit a discomll naive Bayes model.

    Passes ``input_dict["dataset"]`` to ``naivebayes.fit`` with
    ``save_results=True``.

    Returns a dict whose ``"fitmodel_url"`` entry is the value returned by
    ``fit``.
    """
    from discomll.classification import naivebayes

    fitmodel_url = naivebayes.fit(input_dict["dataset"], save_results=True)
    return {"fitmodel_url": fitmodel_url}
def naivebayes_predict(input_dict):
    """Predict with a previously fitted discomll naive Bayes model.

    Passes ``input_dict["dataset"]``, ``input_dict["fitmodel_url"]`` and the
    ``m`` parameter to ``naivebayes.predict`` with ``save_results=True``.
    An empty ``input_dict["m"]`` falls back to ``1``; any other value is
    forwarded unchanged.

    Returns a dict whose ``"string"`` entry is the value returned by
    ``predict``.
    """
    from discomll.classification import naivebayes

    # Use the defaulted value; previously it was computed but the raw
    # (possibly empty) input was passed to predict instead.
    m = 1 if input_dict["m"] == "" else input_dict["m"]

    predictions_url = naivebayes.predict(
        input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        m=m,
        save_results=True)
    return {"string": predictions_url}
def results_to_file(input_dict):
    """Library-side placeholder; returns an empty output dict."""
    # implementation is in visualization_views.py
    return {}
def measure_distribution(input_dict):
    """Library-side placeholder; returns an empty output dict."""
    # implementation is in visualization_views.py
    return {}
def model_view(input_dict):
    """Library-side placeholder; returns an empty output dict."""
    # implementation is in visualization_views.py
    return {}
def bigdata_ca(input_dict):
    """Library-side placeholder; returns an empty output dict."""
    # implementation is in visualization_views.py
    return {}
def bigdata_mse(input_dict):
    """Library-side placeholder; returns an empty output dict."""
    # implementation is in visualization_views.py
    return {}
......@@ -5,7 +5,7 @@
"category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73",
"treeview_image": "",
"uid": "4f2ce923-62e6-4be1-a394-72ac52988386",
"is_streaming": false,
"windows_queue": false,
"package": "big_data",
"interaction_view": "",
"has_progress_bar": false,
......@@ -19,7 +19,7 @@
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"is_streaming": false,
"order": 1,
"name": "Distributed Random Forest"
}
......@@ -171,7 +171,7 @@
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "leaf_min_inst",
"variable": "min_samples_split",
"parameter": true,
"order": 4,
"uid": "a2f366a9-af74-4b3f-90ed-33c1fcad4c3a"
......
......@@ -5,7 +5,7 @@
"category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73",
"treeview_image": "",
"uid": "72a39fab-5433-493f-ae22-12a264075356",
"is_streaming": false,
"windows_queue": false,
"package": "big_data",
"interaction_view": "",
"has_progress_bar": false,
......@@ -19,7 +19,7 @@
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"is_streaming": false,
"order": 1,
"name": "Distributed Weighted Forest"
}
......@@ -205,7 +205,7 @@
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "leaf_min_inst",
"variable": "min_samples_split",
"parameter": true,
"order": 5,
"uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
......
......@@ -5,7 +5,7 @@
"category": "08c90036-fd13-4ff3-ac20-01982262a782",
"treeview_image": "",
"uid": "d9de579e-fdd9-47da-948e-69183f24340f",
"is_streaming": false,
"windows_queue": false,
"package": "big_data",
"interaction_view": "",
"has_progress_bar": false,
......@@ -19,7 +19,7 @@
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"is_streaming": false,
"order": 1,
"name": "Apply Classifier"
}
......@@ -37,7 +37,7 @@
"parameter_type": null,
"variable": "dataset",
"parameter": false,
"order": 5,
"order": 4,
"uid": "0c174cfd-0d61-44ff-8ce2-9c32f8791293"
}
},
......@@ -58,23 +58,6 @@
"uid": "46e9a02d-a613-4a01-a8ee-51d578a32fea"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Random forest - random state",
"short_name": "rds",
"default": "None",
"description": "Define a random state for predict phase.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 3,
"uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -96,7 +79,7 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Weighted forest - similarity coeff",
"name": "Distributed Weighted forest - similarity coeff",
"short_name": "coe",
"default": "0.5",
"description": "Percentage of most similar trees to include in prediction (0 - 1)",
......@@ -105,7 +88,7 @@
"parameter_type": "text",
"variable": "coeff",
"parameter": true,
"order": 4,
"order": 3,
"uid": "d979ff23-eb11-40cf-9d81-2a71ddf5d790"
}
},
......
from django.shortcuts import render
def measure_distribution(request,input_dict,output_dict,widget):
def measure_distribution(request, input_dict, output_dict, widget):
from disco.core import result_iterator
from discomll.utils import distribution
......@@ -11,9 +12,11 @@ def measure_distribution(request,input_dict,output_dict,widget):
dist += str(k.split("/")[-1][:-2]) + " " + str(v) + "\n"
input_dict["string"] = dist
return render(request, 'visualizations/display_string.html',{'widget':widget,'input_dict':input_dict,'output_dict':output_dict})
return render(request, 'visualizations/display_string.html',
{'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def bigdata_ca(request,input_dict,output_dict,widget):
def bigdata_ca(request, input_dict, output_dict, widget):
from discomll.utils import accuracy
import os.path
from mothra.settings import MEDIA_ROOT
......@@ -21,70 +24,72 @@ def bigdata_ca(request,input_dict,output_dict,widget):
folder = 'discomll_measures'
tag = input_dict["predictions"]
destination = MEDIA_ROOT+'/'+folder+"/"+tag[0][6:]+'.txt'
destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
ensure_dir(destination)
if input_dict["dataset"].params["id_index"] == -1:
input_dict["string"] = "ID index should be defined."
elif not os.path.isfile(destination): #file doesnt exists
results = accuracy.measure(test_data = input_dict["dataset"],
predictions = input_dict["predictions"],
measure = "ca")
elif not os.path.isfile(destination): # file doesnt exists
measure, acc = accuracy.measure(test_data=input_dict["dataset"],
predictions=input_dict["predictions"],
measure="ca")
string = "Classification Accuracy \n"
for k, v in results:
string += str(v) + "\n"
score = str(measure) + " " + str(acc) + "\n"
string += score
input_dict["string"] = string
f = open(destination,'w')
f.write(str(v))
f = open(destination, 'w')
f.write(score)
f.close()
else:
#ca results are cached
string = "Classification Accuracy \n"
f = open(destination,'r')
f = open(destination, 'r')
input_dict["string"] = string + str(f.readlines()[0])
f.close()
return render(request, 'visualizations/display_string.html',{'widget':widget,'input_dict':input_dict,'output_dict':output_dict})
return render(request, 'visualizations/display_string.html',
{'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def bigdata_mse(request, input_dict, output_dict, widget):
    """Render the mean squared error of a set of predictions.

    The score is cached in a text file under
    ``MEDIA_ROOT/discomll_measures/``, named after the first prediction tag
    with its first six characters removed (presumably a URL-scheme prefix --
    TODO confirm). If that file already exists its first line is shown
    instead of recomputing the measure.

    When the dataset's ``id_index`` parameter is ``-1`` nothing is computed
    and a message asking for an ID index is shown instead.

    Returns the response produced by ``render`` for the
    ``visualizations/display_string.html`` template.
    """
    from discomll.utils import accuracy
    import os.path
    from mothra.settings import MEDIA_ROOT
    from workflows.helpers import ensure_dir

    folder = 'discomll_measures'
    tag = input_dict["predictions"]
    destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + '.txt'
    ensure_dir(destination)

    if input_dict["dataset"].params["id_index"] == -1:
        input_dict["string"] = "ID index should be defined."
    elif not os.path.isfile(destination):
        # No cached score yet: compute it, show it and store it.
        measure, acc = accuracy.measure(test_data=input_dict["dataset"],
                                        predictions=input_dict["predictions"],
                                        measure="mse")
        score = str(measure) + " " + str(acc) + "\n"
        input_dict["string"] = "Mean squared error\n" + score

        # The context manager closes the file even if the write fails.
        with open(destination, 'w') as f:
            f.write(score)
    else:
        # The score is cached: reuse the first line of the stored file.
        with open(destination, 'r') as f:
            input_dict["string"] = "Mean squared error\n" + f.readline()

    return render(request, 'visualizations/display_string.html',
                  {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict})
def results_to_file(request,input_dict,output_dict,widget):
def results_to_file(request, input_dict, output_dict, widget):
from disco.core import result_iterator
import os.path
from mothra.settings import MEDIA_ROOT
......@@ -94,13 +99,13 @@ def results_to_file(request,input_dict,output_dict,widget):
folder = 'discomll_results'
add = "add" if input_dict["add_params"] == "true" else ""
destination = MEDIA_ROOT+'/'+folder+"/"+tag[0][6:]+add+'.txt'
destination = MEDIA_ROOT + '/' + folder + "/" + tag[0][6:] + add + '.txt'
ensure_dir(destination)