Commit 9b785640 authored by romanorac

support for new version of discomll

parent 18152626
...@@ -17,4 +17,3 @@ liac-arff==2.0.1 ...@@ -17,4 +17,3 @@ liac-arff==2.0.1
networkx==1.9.1 networkx==1.9.1
djangorestframework==3.0.3 djangorestframework==3.0.3
django-filter==0.9.1 django-filter==0.9.1
discomll
def file_url(input_dict): def file_url(input_dict):
from discomll import dataset from discomll import dataset
if input_dict["range"] == "true": if input_dict["range"] == "true":
urls = [url.strip() for url in input_dict["url"].split("\n") if url != ""] urls = [url.strip() for url in input_dict["url"].split("\n") if url != ""]
else: else:
urls = [[url.strip()] for url in input_dict["url"].split("\n") if url != ""] urls = [[url.strip()] for url in input_dict["url"].split("\n") if url != ""]
for url in urls: for url in urls:
if url[0].split("://")[0] == "https": if url[0].split("://")[0] == "https":
raise Exception("Dataset should be accessible over HTTP.") raise Exception("Dataset should be accessible over HTTP.")
del(input_dict["url"]) del (input_dict["url"])
X_indices_splited = input_dict["X_indices"].replace(" ","").split("-") X_indices_splited = input_dict["X_indices"].replace(" ", "").split("-")
if len(X_indices_splited) == 2: if len(X_indices_splited) == 2:
a, b = X_indices_splited a, b = X_indices_splited
if not a.isdigit() or not b.isdigit(): if not a.isdigit() or not b.isdigit():
raise Exception("Feature indices should be integers. Example: 1-10") raise Exception("Feature indices should be integers. Example: 1-10")
X_indices = range(int(a), int(b)) X_indices = range(int(a), int(b))
else: else:
X_indices = [int(v) for v in input_dict["X_indices"].replace(" ","").split(",") if v != ""] X_indices = [int(v) for v in input_dict["X_indices"].replace(" ", "").split(",") if v != ""]
del(input_dict["X_indices"]) del (input_dict["X_indices"])
input_dict["data_type"] = "gzip" if input_dict["data_type"] == "true" else "" input_dict["data_type"] = "gzip" if input_dict["data_type"] == "true" else ""
...@@ -29,20 +27,20 @@ def file_url(input_dict): ...@@ -29,20 +27,20 @@ def file_url(input_dict):
elif input_dict["atr_meta"] == "discrete": elif input_dict["atr_meta"] == "discrete":
X_meta = ["d" for i in range(len(X_indices))] X_meta = ["d" for i in range(len(X_indices))]
else: else:
X_meta = input_dict["custom"] X_meta = input_dict["custom"]
data = dataset.Data(data_tag = urls, data = dataset.Data(data_tag=urls,
X_indices = X_indices, X_indices=X_indices,
X_meta = X_meta, X_meta=X_meta,
generate_urls = True if input_dict["range"] == "true" else False, generate_urls=True if input_dict["range"] == "true" else False,
**input_dict) **input_dict)
print data.params print data.params
return {"dataset" : data} return {"dataset": data}
def big_data_apply_classifier(input_dict): def big_data_apply_classifier(input_dict):
if "naivebayes_fitmodel" in input_dict["fitmodel_url"]: if "naivebayes_fitmodel" in input_dict["fitmodel_url"]:
return naivebayes_predict(input_dict) return naivebayes_predict(input_dict)
elif "logreg_fitmodel" in input_dict["fitmodel_url"]: elif "logreg_fitmodel" in input_dict["fitmodel_url"]:
...@@ -60,206 +58,224 @@ def big_data_apply_classifier(input_dict): ...@@ -60,206 +58,224 @@ def big_data_apply_classifier(input_dict):
elif "linreg_fitmodel" in input_dict["fitmodel_url"]: elif "linreg_fitmodel" in input_dict["fitmodel_url"]:
return linreg_predict(input_dict) return linreg_predict(input_dict)
def lwlr_fit_predict(input_dict): def lwlr_fit_predict(input_dict):
from discomll.regression import locally_weighted_linear_regression from discomll.regression import locally_weighted_linear_regression
predictions_url = locally_weighted_linear_regression.fit_predict( predictions_url = locally_weighted_linear_regression.fit_predict(
fitting_data = input_dict["fitting_dataset"], fitting_data=input_dict["fitting_dataset"],
training_data = input_dict["training_dataset"], training_data=input_dict["training_dataset"],
tau = input_dict["tau"], tau=input_dict["tau"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def dt_fit(input_dict): def dt_fit(input_dict):
from discomll.ensemble import forest_distributed_decision_trees from discomll.ensemble import forest_distributed_decision_trees
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = forest_distributed_decision_trees.fit(input_dict["dataset"], fitmodel_url = forest_distributed_decision_trees.fit(input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"], trees_per_chunk=input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes=input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"], min_samples_leaf=input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"], min_samples_split=input_dict["min_samples_split"],
class_majority = input_dict["majority"], class_majority=input_dict["majority"],
bootstrap = input_dict["bootstrap"] == "true", bootstrap=input_dict["bootstrap"] == "true",
measure = input_dict["measure"], measure=input_dict["measure"],
accuracy = input_dict["accuracy"], accuracy=input_dict["accuracy"],
separate_max = input_dict["separate_max"], separate_max=input_dict["separate_max"] == "true",
random_state = random_state, random_state=random_state,
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def dt_predict(input_dict): def dt_predict(input_dict):
from discomll.ensemble import forest_distributed_decision_trees from discomll.ensemble import forest_distributed_decision_trees
predictions_url = forest_distributed_decision_trees.predict(input_dict["dataset"], predictions_url = forest_distributed_decision_trees.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def rf_fit(input_dict): def rf_fit(input_dict):
from discomll.ensemble import distributed_random_forest from discomll.ensemble import distributed_random_forest
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = distributed_random_forest.fit(input_dict["dataset"], fitmodel_url = distributed_random_forest.fit(input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"], trees_per_chunk=input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes=input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"], min_samples_leaf=input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"], min_samples_split=input_dict["min_samples_split"],
class_majority = input_dict["majority"], class_majority=input_dict["majority"],
measure = input_dict["measure"], measure=input_dict["measure"],
accuracy = input_dict["accuracy"], accuracy=input_dict["accuracy"],
separate_max = input_dict["separate_max"], separate_max=input_dict["separate_max"] == "true",
random_state = random_state, random_state=random_state,
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def rf_predict(input_dict): def rf_predict(input_dict):
from discomll.ensemble import distributed_random_forest from discomll.ensemble import distributed_random_forest
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
predictions_url = distributed_random_forest.predict(input_dict["dataset"], predictions_url = distributed_random_forest.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
random_state = random_state, save_results=True)
save_results = True)
return {"string": predictions_url} return {"string": predictions_url}
def wrf_fit(input_dict): def wrf_fit(input_dict):
from discomll.ensemble import distributed_weighted_forest_rand from discomll.ensemble import distributed_weighted_forest_rand
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = distributed_weighted_forest_rand.fit(input_dict["dataset"], fitmodel_url = distributed_weighted_forest_rand.fit(input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"], trees_per_chunk=input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes=input_dict["tree_nodes"],
num_medoids = input_dict["num_medoids"], num_medoids=input_dict["num_medoids"],
min_samples_leaf = input_dict["min_samples_leaf"], min_samples_leaf=input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"], min_samples_split=input_dict["min_samples_split"],
class_majority = input_dict["majority"], class_majority=input_dict["majority"],
measure = input_dict["measure"], measure=input_dict["measure"],
accuracy = input_dict["accuracy"], accuracy=input_dict["accuracy"],
separate_max = input_dict["separate_max"], separate_max=input_dict["separate_max"] == "true",
random_state = random_state, random_state=random_state,
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def wrf_predict(input_dict): def wrf_predict(input_dict):
from discomll.ensemble import distributed_weighted_forest_rand from discomll.ensemble import distributed_weighted_forest_rand
predictions_url = distributed_weighted_forest_rand.predict(input_dict["dataset"], predictions_url = distributed_weighted_forest_rand.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
coeff = input_dict["coeff"], coeff=input_dict["coeff"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def linsvm_fit(input_dict): def linsvm_fit(input_dict):
from discomll.classification import linear_svm from discomll.classification import linear_svm
fitmodel_url = linear_svm.fit(input_dict["dataset"], fitmodel_url = linear_svm.fit(input_dict["dataset"],
nu = input_dict["nu"], nu=input_dict["nu"],
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def linsvm_predict(input_dict): def linsvm_predict(input_dict):
from discomll.classification import linear_svm from discomll.classification import linear_svm
predictions_url = linear_svm.predict(input_dict["dataset"], predictions_url = linear_svm.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def linreg_fit(input_dict): def linreg_fit(input_dict):
from discomll.regression import linear_regression from discomll.regression import linear_regression
fitmodel_url = linear_regression.fit(input_dict["dataset"], fitmodel_url = linear_regression.fit(input_dict["dataset"],
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def linreg_predict(input_dict): def linreg_predict(input_dict):
from discomll.regression import linear_regression from discomll.regression import linear_regression
predictions_url = linear_regression.predict(input_dict["dataset"], predictions_url = linear_regression.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def kmeans_fit(input_dict): def kmeans_fit(input_dict):
from discomll.clustering import kmeans from discomll.clustering import kmeans
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = kmeans.fit(input_dict["dataset"], fitmodel_url = kmeans.fit(input_dict["dataset"],
n_clusters = input_dict["clusters"], n_clusters=input_dict["clusters"],
max_iterations = input_dict["itr"], max_iterations=input_dict["itr"],
random_state = random_state, random_state=random_state,
save_results = True) save_results=True)
return {"fitmodel_url": fitmodel_url}
return {"fitmodel_url" : fitmodel_url}
def kmeans_predict(input_dict): def kmeans_predict(input_dict):
from discomll.clustering import kmeans from discomll.clustering import kmeans
predictions_url = kmeans.predict(input_dict["dataset"], predictions_url = kmeans.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def logreg_fit(input_dict): def logreg_fit(input_dict):
from discomll.classification import logistic_regression from discomll.classification import logistic_regression
fitmodel_url = logistic_regression.fit(input_dict["dataset"], fitmodel_url = logistic_regression.fit(input_dict["dataset"],
alpha = input_dict["alpha"], alpha=input_dict["alpha"],
max_iterations = input_dict["itr"], max_iterations=input_dict["itr"],
save_results = True) save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def logreg_predict(input_dict): def logreg_predict(input_dict):
from discomll.classification import logistic_regression from discomll.classification import logistic_regression
predictions_url = logistic_regression.predict(input_dict["dataset"], predictions_url = logistic_regression.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
save_results = True) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def naivebayes_fit(input_dict): def naivebayes_fit(input_dict):
from discomll.classification import naivebayes from discomll.classification import naivebayes
fitmodel_url = naivebayes.fit(input_dict["dataset"], save_results = True) fitmodel_url = naivebayes.fit(input_dict["dataset"], save_results=True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url": fitmodel_url}
def naivebayes_predict(input_dict): def naivebayes_predict(input_dict):
from discomll.classification import naivebayes from discomll.classification import naivebayes
m = 1 if input_dict["m"] == "" else input_dict["m"] m = 1 if input_dict["m"] == "" else input_dict["m"]
predictions_url = naivebayes.predict(input_dict["dataset"], predictions_url = naivebayes.predict(input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url=input_dict["fitmodel_url"],
m = input_dict["m"], m=input_dict["m"],
save_results = True ) save_results=True)
return {"string": predictions_url} return {"string": predictions_url}
def results_to_file(input_dict): def results_to_file(input_dict):
#implementation is in visualization_views.py # implementation is in visualization_views.py
return {} return {}
def measure_distribution(input_dict): def measure_distribution(input_dict):
#implementation is in visualization_views.py # implementation is in visualization_views.py
return {} return {}
def model_view(input_dict): def model_view(input_dict):
#implementation is in visualization_views.py # implementation is in visualization_views.py
return {} return {}
def bigdata_ca(input_dict): def bigdata_ca(input_dict):
#implementation is in visualization_views.py # implementation is in visualization_views.py
return {} return {}
def bigdata_mse(input_dict): def bigdata_mse(input_dict):
#implementation is in visualization_views.py # implementation is in visualization_views.py
return {} return {}
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
"category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73", "category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73",
"treeview_image": "", "treeview_image": "",
"uid": "4f2ce923-62e6-4be1-a394-72ac52988386", "uid": "4f2ce923-62e6-4be1-a394-72ac52988386",
"is_streaming": false, "windows_queue": false,
"package": "big_data", "package": "big_data",
"interaction_view": "", "interaction_view": "",
"has_progress_bar": false, "has_progress_bar": false,
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
"wsdl_method": "", "wsdl_method": "",
"wsdl": "", "wsdl": "",
"interactive": false, "interactive": false,
"windows_queue": false, "is_streaming": false,
"order": 1, "order": 1,
"name": "Distributed Random Forest" "name": "Distributed Random Forest"
} }
...@@ -171,7 +171,7 @@ ...@@ -171,7 +171,7 @@
"required": true, "required": true,
"multi": false, "multi": false,
"parameter_type": "text", "parameter_type": "text",
"variable": "leaf_min_inst", "variable": "min_samples_split",
"parameter": true, "parameter": true,
"order": 4, "order": 4,
"uid": "a2f366a9-af74-4b3f-90ed-33c1fcad4c3a" "uid": "a2f366a9-af74-4b3f-90ed-33c1fcad4c3a"
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
"category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73", "category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73",
"treeview_image": "", "treeview_image": "",
"uid": "72a39fab-5433-493f-ae22-12a264075356", "uid": "72a39fab-5433-493f-ae22-12a264075356",
"is_streaming": false, "windows_queue": false,
"package": "big_data", "package": "big_data",
"interaction_view": "", "interaction_view": "",
"has_progress_bar": false, "has_progress_bar": false,
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
"wsdl_method": "", "wsdl_method": "",
"wsdl": "", "wsdl": "",
"interactive": false, "interactive": false,
"windows_queue": false, "is_streaming": false,
"order": 1, "order": 1,
"name": "Distributed Weighted Forest" "name": "Distributed Weighted Forest"
} }
...@@ -205,7 +205,7 @@ ...@@ -205,7 +205,7 @@
"required": true, "required": true,
"multi": false, "multi": false,
"parameter_type": "text", "parameter_type": "text",
"variable": "leaf_min_inst", "variable": "min_samples_split",
"parameter": true, "parameter": true,
"order": 5, "order": 5,
"uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86" "uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
"category": "08c90036-fd13-4ff3-ac20-01982262a782", "category": "08c90036-fd13-4ff3-ac20-01982262a782",
"treeview_image": "", "treeview_image": "",
"uid": "d9de579e-fdd9-47da-948e-69183f24340f", "uid": "d9de579e-fdd9-47da-948e-69183f24340f",
"is_streaming": false, "windows_queue": false,
"package": "big_data", "package": "big_data",
"interaction_view": "", "interaction_view": "",
"has_progress_bar": false, "has_progress_bar": false,
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
"wsdl_method": "", "wsdl_method": "",
"wsdl": "", "wsdl": "",
"interactive": false, "interactive": false,
"windows_queue": false, "is_streaming": false,
"order": 1, "order": 1,
"name": "Apply Classifier" "name": "Apply Classifier"
} }
...@@ -37,7 +37,7 @@ ...@@ -37,7 +37,7 @@
"parameter_type": null, "parameter_type": null,
"variable": "dataset", "variable": "dataset",
"parameter": false, "parameter": false,
"order": 5, "order": 4,
"uid": "0c174cfd-0d61-44ff-8ce2-9c32f8791293" "uid": "0c174cfd-0d61-44ff-8ce2-9c32f8791293"
} }
}, },
...@@ -58,23 +58,6 @@ ...@@ -58,23 +58,6 @@
"uid": "46e9a02d-a613-4a01-a8ee-51d578a32fea" "uid": "46e9a02d-a613-4a01-a8ee-51d578a32fea"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Random forest - random state",
"short_name": "rds",
"default": "None",
"description": "Define a random state for predict phase.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 3,
"uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
...@@ -96,7 +79,7 @@ ...@@ -96,7 +79,7 @@
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f", "widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Weighted forest - similarity coeff", "name": "Distributed Weighted forest - similarity coeff",
"short_name": "coe", "short_name": "coe",
"default": "0.5", "default": "0.5",
"description": "Percentage of most similar treees to include in prediction (0 - 1)", "description": "Percentage of most similar treees to include in prediction (0 - 1)",
...@@ -105,7 +88,7 @@ ...@@ -105,7 +88,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "coeff", "variable": "coeff",
"parameter": true, "parameter": true,
"order": 4, "order": 3,
"uid": "d979ff23-eb11-40cf-9d81-2a71ddf5d790"