Commit b0d8c8d9 authored by hiphop's avatar hiphop

nlp stop words, big_data algorithm changes

parent 34b5f041
......@@ -36,6 +36,8 @@ def file_url(input_dict):
X_meta = X_meta,
generate_urls = True if input_dict["range"] == "true" else False,
**input_dict)
print data.params
return {"dataset" : data}
......@@ -49,11 +51,11 @@ def big_data_apply_classifier(input_dict):
return linsvm_predict(input_dict)
elif "kmeans_fitmodel" in input_dict["fitmodel_url"]:
return kmeans_predict(input_dict)
elif "dt_fitmodel" in input_dict["fitmodel_url"]:
elif "fddt_fitmodel" in input_dict["fitmodel_url"]:
return dt_predict(input_dict)
elif "rf_fitmodel" in input_dict["fitmodel_url"]:
elif "drf_fitmodel" in input_dict["fitmodel_url"]:
return rf_predict(input_dict)
elif "wrf_fitmodel" in input_dict["fitmodel_url"]:
elif "dwfr_fitmodel" in input_dict["fitmodel_url"]:
return wrf_predict(input_dict)
elif "linreg_fitmodel" in input_dict["fitmodel_url"]:
return linreg_predict(input_dict)
......@@ -72,14 +74,14 @@ def dt_fit(input_dict):
from discomll.ensemble import forest_distributed_decision_trees
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
bootstrap = input_dict["bootstrap"] == "true"
fitmodel_url = forest_distributed_decision_trees.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
bootstrap = bootstrap,
bootstrap = input_dict["bootstrap"] == "true",
measure = input_dict["measure"],
accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
......@@ -102,11 +104,13 @@ def rf_fit(input_dict):
fitmodel_url = distributed_random_forest.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"],
leaf_min_inst = input_dict["leaf_min_inst"],
class_majority = input_dict["majority"],
max_tree_nodes = input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
measure = input_dict["measure"],
split_fun = input_dict["split_fun"],
accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
random_state = random_state,
save_results = True)
......@@ -119,32 +123,35 @@ def rf_predict(input_dict):
predictions_url = distributed_random_forest.predict(input = input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"],
diff = input_dict["diff"],
random_state = random_state,
save_results = True)
return {"string": predictions_url}
def wrf_fit(input_dict):
from discomll.ensemble import distributed_weighted_forest
from discomll.ensemble import distributed_weighted_forest_rand
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
fitmodel_url = distributed_weighted_forest.fit(input = input_dict["dataset"],
fitmodel_url = distributed_weighted_forest_rand.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"],
leaf_min_inst = input_dict["leaf_min_inst"],
class_majority = input_dict["majority"],
max_tree_nodes = input_dict["tree_nodes"],
num_medoids = input_dict["num_medoids"],
min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
measure = input_dict["measure"],
split_fun = input_dict["split_fun"],
save_results = True,
random_state = random_state)
accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
random_state = random_state,
save_results = True)
return {"fitmodel_url" : fitmodel_url}
def wrf_predict(input_dict):
from discomll.ensemble import distributed_weighted_forest
from discomll.ensemble import distributed_weighted_forest_rand
predictions_url = distributed_weighted_forest.predict(input = input_dict["dataset"],
predictions_url = distributed_weighted_forest_rand.predict(input = input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"],
coeff = input_dict["coeff"],
save_results = True)
return {"string": predictions_url}
......
......@@ -28,26 +28,60 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Discretization",
"short_name": "spf",
"default": "equal_freq",
"description": "Select equal frequency discretization or random discretization for numeric attributes",
"name": "Discretization accuracy",
"short_name": "dac",
"default": "1",
"description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "split_fun",
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 7,
"order": 8,
"uid": "00758cdf-2eb5-43c5-bedf-bd3b8b9c29d6"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
      "description": "Separate the most represented class.",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 9,
"uid": "21444978-142f-4f3d-947c-20e0b41a2c9b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
      "description": "The minimum number of samples in newly created leaves. A split is discarded if, after the split, one of the leaves would contain fewer than min samples leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 5,
"uid": "52591706-7f30-4def-a788-3e07d3f82876"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Max tree nodes",
"short_name": "mnt",
"default": "20",
"default": "100",
"description": "Max. number of decision tree nodes",
"required": true,
"multi": false,
......@@ -88,7 +122,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 6,
"order": 7,
"uid": "68cbccf9-7469-4b55-b96e-4f7c6a3c9cde"
}
},
......@@ -105,7 +139,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 8,
"order": 10,
"uid": "8e6e2d96-3457-4b23-ac93-ab90b083920f"
}
},
......@@ -132,7 +166,7 @@
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples split",
"short_name": "lmi",
"default": "5",
"default": "10",
"description": "Min. number of samples to split the node",
"required": true,
"multi": false,
......@@ -156,7 +190,7 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 5,
"order": 6,
"uid": "fe7f5d5a-c2e2-4ae9-b138-18b1de7c4e93"
}
},
......
......@@ -37,10 +37,44 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 5,
"order": 7,
"uid": "1b23ead1-b104-4d27-a6fd-b23de6efa28f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Randomly chosen medoids",
"short_name": "rcm",
"default": "3",
      "description": "Number of randomly chosen medoids used to calculate similarity.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "num_medoids",
"parameter": true,
"order": 4,
"uid": "1bbcbc2c-a9d5-4427-a8ef-e4dd58c22f86"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 10,
"uid": "2ccff5c1-7e06-4887-863d-7acf76209e50"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -54,7 +88,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 8,
"order": 11,
"uid": "31c68e34-3bff-41bb-bf77-925c6171a6f6"
}
},
......@@ -75,13 +109,30 @@
"uid": "37879268-0aa9-4458-afb2-71a521acb299"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
      "description": "The minimum number of samples in newly created leaves. A split is discarded if, after the split, one of the leaves would contain fewer than min samples leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 6,
"uid": "3a893a69-f22e-448b-9a92-222573c655ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Max tree nodes",
"short_name": "mnt",
"default": "20",
"default": "100",
"description": "Max. number of decision tree nodes",
"required": true,
"multi": false,
......@@ -92,6 +143,23 @@
"uid": "3d48b0d0-a304-45d5-9d18-3ca17e8fcf05"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization accuracy",
"short_name": "dac",
"default": "1",
"description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 9,
"uid": "3ff0f040-3d11-413f-975a-1fde57bf289b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -122,7 +190,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 6,
"order": 8,
"uid": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
}
},
......@@ -132,34 +200,17 @@
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples split",
"short_name": "lmi",
"default": "5",
"default": "10",
"description": "Min. number of samples to split the node",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "leaf_min_inst",
"parameter": true,
"order": 4,
"order": 5,
"uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization",
"short_name": "spf",
"default": "equal_freq",
"description": "Select equal frequency discretization or random discretization for numeric attributes",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "split_fun",
"parameter": true,
"order": 7,
"uid": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
......@@ -189,23 +240,5 @@
"value": "info_gain",
"abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Equal frequency discretization",
"uid": "4ea5c55a-92a8-4541-a1cc-9aabb0fd82c0",
"value": "equal_freq",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Random discretization",
"uid": "838f798d-e00e-4216-8990-ebc3c1929c0e",
"value": "random",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
}
]
\ No newline at end of file
......@@ -71,7 +71,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 7,
"order": 8,
"uid": "28f53666-76b0-4d44-acab-0824e603a848"
}
},
......@@ -88,7 +88,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 10,
"order": 11,
"uid": "40bc0e36-427f-4517-ac56-55ef033a0e9c"
}
},
......@@ -105,7 +105,7 @@
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 8,
"order": 9,
"uid": "45a0c36c-d61a-4708-b54a-6908494ee090"
}
},
......@@ -122,7 +122,7 @@
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 5,
"order": 6,
"uid": "739e0f16-2ac9-423e-8050-58778553ca48"
}
},
......@@ -181,7 +181,7 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "74df0d6e-684f-46ae-975d-ba1ce5425066",
"name": "Separate most present class",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "",
......@@ -190,7 +190,7 @@
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 9,
"order": 10,
"uid": "eac59b1f-c35d-4116-b4d7-9320d2b4b351"
}
},
......@@ -207,7 +207,7 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 6,
"order": 7,
"uid": "f31f0f86-238b-4ce1-b7e7-1ad6e88f55b0"
}
},
......
......@@ -71,7 +71,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 4,
"order": 3,
"uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb"
}
},
......@@ -96,16 +96,16 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Random forest - difference",
"short_name": "dff",
"default": "0.3",
"description": "Random forest calculates difference in probability between most and second most probable prediction. If difference is greater than parameter diff, it outputs prediction. If a test sample is hard to predict (difference is never higher than diff), it queries whole ensemble to make a prediction.",
"name": "Weighted forest - similarity coeff",
"short_name": "coe",
"default": "0.5",
      "description": "Percentage of the most similar trees to include in prediction (0 - 1)",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "diff",
"variable": "coeff",
"parameter": true,
"order": 3,
"order": 4,
"uid": "d979ff23-eb11-40cf-9d81-2a71ddf5d790"
}
},
......
......@@ -15,7 +15,6 @@ def measure_distribution(request,input_dict,output_dict,widget):
def bigdata_ca(request,input_dict,output_dict,widget):
from discomll.utils import accuracy
from disco.core import result_iterator
import os.path
from mothra.settings import MEDIA_ROOT
from workflows.helpers import ensure_dir
......@@ -32,7 +31,7 @@ def bigdata_ca(request,input_dict,output_dict,widget):
predictions = input_dict["predictions"],
measure = "ca")
string = "Classification Accuracy \n"
for k, v in result_iterator(results):
for k, v in results:
string += str(v) + "\n"
input_dict["string"] = string
......@@ -67,7 +66,7 @@ def bigdata_mse(request,input_dict,output_dict,widget):
predictions = input_dict["predictions"],
measure = "mse")
string = "Mean squared error\n"
for k, v in result_iterator(results):
for k, v in results:
string += str(v) + "\n"
input_dict["string"] = string
......
......@@ -9,8 +9,7 @@ import re
import itertools
import subprocess
def definition_sentences2(input_dict):
return {}
webservices_totrtale_url = "http://vihar.ijs.si:8104"
def merge_sentences(input_dict):
"""
......@@ -89,20 +88,19 @@ def load_corpus2(input_dict):
'''
use_text = input_dict["use_text"] == "true"
if use_text:
if use_text: #checkbox is checked
fname = "input_string.txt"
text = input_dict[u"text"].strip()
if len(text) == 0:
raise Exception("Please input text or uncheck the Use text checkbox.")
data = base64.b64encode(text)
else:
else: #checkbox is not checked
f = safeOpen(input_dict['file'])
fname = os.path.basename(input_dict['file'])
data = base64.b64encode(f.read())
#define web service
webservices_url = "http://vihar.ijs.si:8104"
webservice_url = webservices_url + "/parseFile"
webservice_url = webservices_totrtale_url + "/parseFile"
params = {"filename": fname, "text": data} #set params
#call web service
......@@ -115,7 +113,10 @@ def load_corpus2(input_dict):
"""
return {'corpus': content[u"resp"]}
def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
def parse_tei(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
"""
Helper function for load tagged corpus. Function parses TEI format.
"""
from xml.dom import minidom
fname = os.path.basename(path)
......@@ -141,6 +142,9 @@ def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sent
return "".join(tab_separated_output).encode("utf8", "ignore")
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
"""
Helper function for load tagged corpus. Function parses tab separated format.
"""
fname = os.path.basename(path)
f = safeOpen(path)
......@@ -152,7 +156,7 @@ def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, s
sentence_counter = 0
for line in f:
splitted_line = re.split(separator, line.strip())#.split(separator)
splitted_line = re.split(separator, line.strip())
if len(splitted_line) >= 4:
new_line = splitted_line[word_index] + "\t" + splitted_line[token_index] + "\t" + splitted_line[lemma_index] + "\t" + splitted_line[pos_index] + "\t\n"
data.append(new_line)
......@@ -175,11 +179,10 @@ def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, s
def load_tagged_corpus(input_dict):
"""
Loads TEI file, which is output of totrtale
Loads a file in TEI or XML format.
"""
data = ""
if input_dict["input_format"] == "tab_format":
try:
word_index = int(input_dict["word_index"]) - 1
......@@ -205,16 +208,6 @@ def load_tagged_corpus(input_dict):
data = parse_tab_separated(input_dict['file'], word_index=word_index, token_index=token_index, lemma_index=lemma_index, pos_index=pos_index, start_tag=start_tag, end_tag=end_tag, separator=separator)
else:
#fname = os.path.basename(input_dict['file'])
#data = f.read()
#path = os.path.dirname(os.path.abspath(__file__)) + os.sep
#subprocess.call(["java -jar " + path+"jing.jar " + path+ "tei_imp.rng <" + data + " >" + "out.txt"],shell=True)
#f = open("out.txt", "r")
#error = f.read()
#if len(error) > 0:
# raise Exception(error)
lemma_name = input_dict["lemma_name"]
pos_name = input_dict["pos_name"]
sentence_tag = input_dict["sentence_tag"]
......@@ -223,18 +216,19 @@ def load_tagged_corpus(input_dict):
if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
raise Exception("Please review parameters for TEI format.")
data = parse_xml(input_dict['file'], lemma_name = lemma_name, pos_name = pos_name, word_tag = word_tag, sentence_tag = sentence_tag)
data = parse_tei(input_dict['file'], lemma_name = lemma_name, pos_name = pos_name, word_tag = word_tag, sentence_tag = sentence_tag)
return {'annotations': data}
def totrtale_request(params):
webservices_url = "http://vihar.ijs.si:8104"
webservice_url = webservices_url + "/runToTrTaLe"
webservice_url = webservices_totrtale_url + "/runToTrTaLe"
return post(webservice_url, params=params)