Commit 3e71fbd1 authored by Janez's avatar Janez

Merge branch 'totrtale_multiproc' of /home/git/repositories/kt/mothra

parents 98cd3217 b0d8c8d9
...@@ -36,6 +36,8 @@ def file_url(input_dict): ...@@ -36,6 +36,8 @@ def file_url(input_dict):
X_meta = X_meta, X_meta = X_meta,
generate_urls = True if input_dict["range"] == "true" else False, generate_urls = True if input_dict["range"] == "true" else False,
**input_dict) **input_dict)
print data.params
return {"dataset" : data} return {"dataset" : data}
...@@ -49,11 +51,11 @@ def big_data_apply_classifier(input_dict): ...@@ -49,11 +51,11 @@ def big_data_apply_classifier(input_dict):
return linsvm_predict(input_dict) return linsvm_predict(input_dict)
elif "kmeans_fitmodel" in input_dict["fitmodel_url"]: elif "kmeans_fitmodel" in input_dict["fitmodel_url"]:
return kmeans_predict(input_dict) return kmeans_predict(input_dict)
elif "dt_fitmodel" in input_dict["fitmodel_url"]: elif "fddt_fitmodel" in input_dict["fitmodel_url"]:
return dt_predict(input_dict) return dt_predict(input_dict)
elif "rf_fitmodel" in input_dict["fitmodel_url"]: elif "drf_fitmodel" in input_dict["fitmodel_url"]:
return rf_predict(input_dict) return rf_predict(input_dict)
elif "wrf_fitmodel" in input_dict["fitmodel_url"]: elif "dwfr_fitmodel" in input_dict["fitmodel_url"]:
return wrf_predict(input_dict) return wrf_predict(input_dict)
elif "linreg_fitmodel" in input_dict["fitmodel_url"]: elif "linreg_fitmodel" in input_dict["fitmodel_url"]:
return linreg_predict(input_dict) return linreg_predict(input_dict)
...@@ -72,14 +74,14 @@ def dt_fit(input_dict): ...@@ -72,14 +74,14 @@ def dt_fit(input_dict):
from discomll.ensemble import forest_distributed_decision_trees from discomll.ensemble import forest_distributed_decision_trees
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
bootstrap = input_dict["bootstrap"] == "true"
fitmodel_url = forest_distributed_decision_trees.fit(input = input_dict["dataset"], fitmodel_url = forest_distributed_decision_trees.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes = input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"], min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"], min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"], class_majority = input_dict["majority"],
bootstrap = bootstrap, bootstrap = input_dict["bootstrap"] == "true",
measure = input_dict["measure"], measure = input_dict["measure"],
accuracy = input_dict["accuracy"], accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"], separate_max = input_dict["separate_max"],
...@@ -102,11 +104,13 @@ def rf_fit(input_dict): ...@@ -102,11 +104,13 @@ def rf_fit(input_dict):
fitmodel_url = distributed_random_forest.fit(input = input_dict["dataset"], fitmodel_url = distributed_random_forest.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"], trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes = input_dict["tree_nodes"],
leaf_min_inst = input_dict["leaf_min_inst"], min_samples_leaf = input_dict["min_samples_leaf"],
class_majority = input_dict["majority"], min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
measure = input_dict["measure"], measure = input_dict["measure"],
split_fun = input_dict["split_fun"], accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
random_state = random_state, random_state = random_state,
save_results = True) save_results = True)
...@@ -119,32 +123,35 @@ def rf_predict(input_dict): ...@@ -119,32 +123,35 @@ def rf_predict(input_dict):
predictions_url = distributed_random_forest.predict(input = input_dict["dataset"], predictions_url = distributed_random_forest.predict(input = input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url = input_dict["fitmodel_url"],
diff = input_dict["diff"],
random_state = random_state, random_state = random_state,
save_results = True) save_results = True)
return {"string": predictions_url} return {"string": predictions_url}
def wrf_fit(input_dict): def wrf_fit(input_dict):
from discomll.ensemble import distributed_weighted_forest from discomll.ensemble import distributed_weighted_forest_rand
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"]) random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = distributed_weighted_forest.fit(input = input_dict["dataset"], fitmodel_url = distributed_weighted_forest_rand.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"], trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"], max_tree_nodes = input_dict["tree_nodes"],
leaf_min_inst = input_dict["leaf_min_inst"], num_medoids = input_dict["num_medoids"],
class_majority = input_dict["majority"], min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
measure = input_dict["measure"], measure = input_dict["measure"],
split_fun = input_dict["split_fun"], accuracy = input_dict["accuracy"],
save_results = True, separate_max = input_dict["separate_max"],
random_state = random_state) random_state = random_state,
save_results = True)
return {"fitmodel_url" : fitmodel_url} return {"fitmodel_url" : fitmodel_url}
def wrf_predict(input_dict): def wrf_predict(input_dict):
from discomll.ensemble import distributed_weighted_forest from discomll.ensemble import distributed_weighted_forest_rand
predictions_url = distributed_weighted_forest.predict(input = input_dict["dataset"], predictions_url = distributed_weighted_forest_rand.predict(input = input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"], fitmodel_url = input_dict["fitmodel_url"],
coeff = input_dict["coeff"],
save_results = True) save_results = True)
return {"string": predictions_url} return {"string": predictions_url}
......
...@@ -28,26 +28,60 @@ ...@@ -28,26 +28,60 @@
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386", "widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Discretization", "name": "Discretization accuracy",
"short_name": "spf", "short_name": "dac",
"default": "equal_freq", "default": "1",
"description": "Select equal frequency discretization or random discretization for numeric attributes", "description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true, "required": true,
"multi": false, "multi": false,
"parameter_type": "select", "parameter_type": "text",
"variable": "split_fun", "variable": "accuracy",
"parameter": true, "parameter": true,
"order": 7, "order": 8,
"uid": "00758cdf-2eb5-43c5-bedf-bd3b8b9c29d6" "uid": "00758cdf-2eb5-43c5-bedf-bd3b8b9c29d6"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "separate_max",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 9,
"uid": "21444978-142f-4f3d-947c-20e0b41a2c9b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
"description": "The minimum number of samples in newly created leaves. A split is discarded if after the split, one of the leaves would contain fewer than min_samples_leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 5,
"uid": "52591706-7f30-4def-a788-3e07d3f82876"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386", "widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Max tree nodes", "name": "Max tree nodes",
"short_name": "mnt", "short_name": "mnt",
"default": "20", "default": "100",
"description": "Max. number of decision tree nodes", "description": "Max. number of decision tree nodes",
"required": true, "required": true,
"multi": false, "multi": false,
...@@ -88,7 +122,7 @@ ...@@ -88,7 +122,7 @@
"parameter_type": "select", "parameter_type": "select",
"variable": "measure", "variable": "measure",
"parameter": true, "parameter": true,
"order": 6, "order": 7,
"uid": "68cbccf9-7469-4b55-b96e-4f7c6a3c9cde" "uid": "68cbccf9-7469-4b55-b96e-4f7c6a3c9cde"
} }
}, },
...@@ -105,7 +139,7 @@ ...@@ -105,7 +139,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "seed", "variable": "seed",
"parameter": true, "parameter": true,
"order": 8, "order": 10,
"uid": "8e6e2d96-3457-4b23-ac93-ab90b083920f" "uid": "8e6e2d96-3457-4b23-ac93-ab90b083920f"
} }
}, },
...@@ -132,7 +166,7 @@ ...@@ -132,7 +166,7 @@
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386", "widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples split", "name": "Min samples split",
"short_name": "lmi", "short_name": "lmi",
"default": "5", "default": "10",
"description": "Min. number of samples to split the node", "description": "Min. number of samples to split the node",
"required": true, "required": true,
"multi": false, "multi": false,
...@@ -156,7 +190,7 @@ ...@@ -156,7 +190,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "majority", "variable": "majority",
"parameter": true, "parameter": true,
"order": 5, "order": 6,
"uid": "fe7f5d5a-c2e2-4ae9-b138-18b1de7c4e93" "uid": "fe7f5d5a-c2e2-4ae9-b138-18b1de7c4e93"
} }
}, },
......
...@@ -37,10 +37,44 @@ ...@@ -37,10 +37,44 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "majority", "variable": "majority",
"parameter": true, "parameter": true,
"order": 5, "order": 7,
"uid": "1b23ead1-b104-4d27-a6fd-b23de6efa28f" "uid": "1b23ead1-b104-4d27-a6fd-b23de6efa28f"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Randomly chosen medoids",
"short_name": "rcm",
"default": "3",
"description": "Number of randomly chosen medoids to calculate similarity.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "num_medoids",
"parameter": true,
"order": 4,
"uid": "1bbcbc2c-a9d5-4427-a8ef-e4dd58c22f86"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 10,
"uid": "2ccff5c1-7e06-4887-863d-7acf76209e50"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
...@@ -54,7 +88,7 @@ ...@@ -54,7 +88,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "seed", "variable": "seed",
"parameter": true, "parameter": true,
"order": 8, "order": 11,
"uid": "31c68e34-3bff-41bb-bf77-925c6171a6f6" "uid": "31c68e34-3bff-41bb-bf77-925c6171a6f6"
} }
}, },
...@@ -75,13 +109,30 @@ ...@@ -75,13 +109,30 @@
"uid": "37879268-0aa9-4458-afb2-71a521acb299" "uid": "37879268-0aa9-4458-afb2-71a521acb299"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
"description": "The minimum number of samples in newly created leaves. A split is discarded if after the split, one of the leaves would contain fewer than min_samples_leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 6,
"uid": "3a893a69-f22e-448b-9a92-222573c655ba"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356", "widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Max tree nodes", "name": "Max tree nodes",
"short_name": "mnt", "short_name": "mnt",
"default": "20", "default": "100",
"description": "Max. number of decision tree nodes", "description": "Max. number of decision tree nodes",
"required": true, "required": true,
"multi": false, "multi": false,
...@@ -92,6 +143,23 @@ ...@@ -92,6 +143,23 @@
"uid": "3d48b0d0-a304-45d5-9d18-3ca17e8fcf05" "uid": "3d48b0d0-a304-45d5-9d18-3ca17e8fcf05"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization accuracy",
"short_name": "dac",
"default": "1",
"description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 9,
"uid": "3ff0f040-3d11-413f-975a-1fde57bf289b"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
...@@ -122,7 +190,7 @@ ...@@ -122,7 +190,7 @@
"parameter_type": "select", "parameter_type": "select",
"variable": "measure", "variable": "measure",
"parameter": true, "parameter": true,
"order": 6, "order": 8,
"uid": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af" "uid": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
} }
}, },
...@@ -132,34 +200,17 @@ ...@@ -132,34 +200,17 @@
"widget": "72a39fab-5433-493f-ae22-12a264075356", "widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples split", "name": "Min samples split",
"short_name": "lmi", "short_name": "lmi",
"default": "5", "default": "10",
"description": "Min. number of samples to split the node", "description": "Min. number of samples to split the node",
"required": true, "required": true,
"multi": false, "multi": false,
"parameter_type": "text", "parameter_type": "text",
"variable": "leaf_min_inst", "variable": "leaf_min_inst",
"parameter": true, "parameter": true,
"order": 4, "order": 5,
"uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86" "uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization",
"short_name": "spf",
"default": "equal_freq",
"description": "Select equal frequency discretization or random discretization for numeric attributes",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "split_fun",
"parameter": true,
"order": 7,
"uid": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{ {
"model": "workflows.abstractoutput", "model": "workflows.abstractoutput",
"fields": { "fields": {
...@@ -189,23 +240,5 @@ ...@@ -189,23 +240,5 @@
"value": "info_gain", "value": "info_gain",
"abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af" "abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
} }
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Equal frequency discretization",
"uid": "4ea5c55a-92a8-4541-a1cc-9aabb0fd82c0",
"value": "equal_freq",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Random discretization",
"uid": "838f798d-e00e-4216-8990-ebc3c1929c0e",
"value": "random",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
} }
] ]
\ No newline at end of file
...@@ -71,7 +71,7 @@ ...@@ -71,7 +71,7 @@
"parameter_type": "select", "parameter_type": "select",
"variable": "measure", "variable": "measure",
"parameter": true, "parameter": true,
"order": 7, "order": 8,
"uid": "28f53666-76b0-4d44-acab-0824e603a848" "uid": "28f53666-76b0-4d44-acab-0824e603a848"
} }
}, },
...@@ -88,7 +88,7 @@ ...@@ -88,7 +88,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "seed", "variable": "seed",
"parameter": true, "parameter": true,
"order": 10, "order": 11,
"uid": "40bc0e36-427f-4517-ac56-55ef033a0e9c" "uid": "40bc0e36-427f-4517-ac56-55ef033a0e9c"
} }
}, },
...@@ -105,7 +105,7 @@ ...@@ -105,7 +105,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "accuracy", "variable": "accuracy",
"parameter": true, "parameter": true,
"order": 8, "order": 9,
"uid": "45a0c36c-d61a-4708-b54a-6908494ee090" "uid": "45a0c36c-d61a-4708-b54a-6908494ee090"
} }
}, },
...@@ -122,7 +122,7 @@ ...@@ -122,7 +122,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "min_samples_leaf", "variable": "min_samples_leaf",
"parameter": true, "parameter": true,
"order": 5, "order": 6,
"uid": "739e0f16-2ac9-423e-8050-58778553ca48" "uid": "739e0f16-2ac9-423e-8050-58778553ca48"
} }
}, },
...@@ -181,7 +181,7 @@ ...@@ -181,7 +181,7 @@
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "74df0d6e-684f-46ae-975d-ba1ce5425066", "widget": "74df0d6e-684f-46ae-975d-ba1ce5425066",
"name": "Separate most present class", "name": "Separate most represented class",
"short_name": "smp", "short_name": "smp",
"default": "true", "default": "true",
"description": "", "description": "",
...@@ -190,7 +190,7 @@ ...@@ -190,7 +190,7 @@
"parameter_type": "checkbox", "parameter_type": "checkbox",
"variable": "separate_max", "variable": "separate_max",
"parameter": true, "parameter": true,
"order": 9, "order": 10,
"uid": "eac59b1f-c35d-4116-b4d7-9320d2b4b351" "uid": "eac59b1f-c35d-4116-b4d7-9320d2b4b351"
} }
}, },
...@@ -207,7 +207,7 @@ ...@@ -207,7 +207,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "majority", "variable": "majority",
"parameter": true, "parameter": true,
"order": 6, "order": 7,
"uid": "f31f0f86-238b-4ce1-b7e7-1ad6e88f55b0" "uid": "f31f0f86-238b-4ce1-b7e7-1ad6e88f55b0"
} }
}, },
......
...@@ -71,7 +71,7 @@ ...@@ -71,7 +71,7 @@
"parameter_type": "text", "parameter_type": "text",
"variable": "seed", "variable": "seed",
"parameter": true, "parameter": true,
"order": 4, "order": 3,
"uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb" "uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb"
} }
}, },
...@@ -96,16 +96,16 @@ ...@@ -96,16 +96,16 @@
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f", "widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Random forest - difference", "name": "Weighted forest - similarity coeff",
"short_name": "dff", "short_name": "coe",
"default": "0.3", "default": "0.5",
"description": "Random forest calculates difference in probability between most and second most probable prediction. If difference is greater than parameter diff, it outputs prediction. If a test sample is hard to predict (difference is never higher than diff), it queries whole ensemble to make a prediction.", "description": "Fraction of most similar trees to include in prediction (0 - 1)",
"required": true, "required": true,
"multi": false, "multi": false,
"parameter_type": "text", "parameter_type": "text",
"variable": "diff", "variable": "coeff",