Commit 3e71fbd1 authored by Janez's avatar Janez

Merge branch 'totrtale_multiproc' of /home/git/repositories/kt/mothra

parents 98cd3217 b0d8c8d9
......@@ -36,6 +36,8 @@ def file_url(input_dict):
X_meta = X_meta,
generate_urls = True if input_dict["range"] == "true" else False,
**input_dict)
print data.params
return {"dataset" : data}
......@@ -49,11 +51,11 @@ def big_data_apply_classifier(input_dict):
return linsvm_predict(input_dict)
elif "kmeans_fitmodel" in input_dict["fitmodel_url"]:
return kmeans_predict(input_dict)
elif "dt_fitmodel" in input_dict["fitmodel_url"]:
elif "fddt_fitmodel" in input_dict["fitmodel_url"]:
return dt_predict(input_dict)
elif "rf_fitmodel" in input_dict["fitmodel_url"]:
elif "drf_fitmodel" in input_dict["fitmodel_url"]:
return rf_predict(input_dict)
elif "wrf_fitmodel" in input_dict["fitmodel_url"]:
elif "dwfr_fitmodel" in input_dict["fitmodel_url"]:
return wrf_predict(input_dict)
elif "linreg_fitmodel" in input_dict["fitmodel_url"]:
return linreg_predict(input_dict)
......@@ -72,14 +74,14 @@ def dt_fit(input_dict):
from discomll.ensemble import forest_distributed_decision_trees
random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])
bootstrap = input_dict["bootstrap"] == "true"
fitmodel_url = forest_distributed_decision_trees.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
bootstrap = bootstrap,
bootstrap = input_dict["bootstrap"] == "true",
measure = input_dict["measure"],
accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
......@@ -102,11 +104,13 @@ def rf_fit(input_dict):
fitmodel_url = distributed_random_forest.fit(input = input_dict["dataset"],
trees_per_chunk = input_dict["trees_per_subset"],
max_tree_nodes = input_dict["tree_nodes"],
leaf_min_inst = input_dict["leaf_min_inst"],
class_majority = input_dict["majority"],
max_tree_nodes = input_dict["tree_nodes"],
min_samples_leaf = input_dict["min_samples_leaf"],
min_samples_split = input_dict["min_samples_split"],
class_majority = input_dict["majority"],
measure = input_dict["measure"],
split_fun = input_dict["split_fun"],
accuracy = input_dict["accuracy"],
separate_max = input_dict["separate_max"],
random_state = random_state,
save_results = True)
......@@ -119,32 +123,35 @@ def rf_predict(input_dict):
predictions_url = distributed_random_forest.predict(input = input_dict["dataset"],
fitmodel_url = input_dict["fitmodel_url"],
diff = input_dict["diff"],
random_state = random_state,
save_results = True)
return {"string": predictions_url}
def wrf_fit(input_dict):
    """
    Fit a distributed weighted forest (rand variant) model with discomll.

    Reads the fitting parameters from input_dict (keys: "dataset",
    "trees_per_subset", "tree_nodes", "num_medoids", "min_samples_leaf",
    "min_samples_split", "majority", "measure", "accuracy", "separate_max",
    "seed") and forwards them to distributed_weighted_forest_rand.fit.

    Returns a dict with the single key "fitmodel_url" holding whatever
    fit() returned (presumably the URL of the stored model).

    Raises KeyError if a required key is missing from input_dict and
    ValueError if "seed" is neither the string "None" nor an integer string.
    """
    from discomll.ensemble import distributed_weighted_forest_rand

    # The widget passes the seed as text; the literal string "None" means "no seed".
    random_state = None if input_dict["seed"] == "None" else int(input_dict["seed"])

    # NOTE(review): the widget fixture in this change still exposes the
    # "Min samples split" input under the variable name "leaf_min_inst".
    # Fall back to that key so the widget keeps working until the fixture
    # is renamed -- confirm which name is intended.
    if "min_samples_split" in input_dict:
        min_samples_split = input_dict["min_samples_split"]
    else:
        min_samples_split = input_dict["leaf_min_inst"]

    fitmodel_url = distributed_weighted_forest_rand.fit(
        input=input_dict["dataset"],
        trees_per_chunk=input_dict["trees_per_subset"],
        max_tree_nodes=input_dict["tree_nodes"],
        num_medoids=input_dict["num_medoids"],
        min_samples_leaf=input_dict["min_samples_leaf"],
        min_samples_split=min_samples_split,
        class_majority=input_dict["majority"],
        measure=input_dict["measure"],
        accuracy=input_dict["accuracy"],
        separate_max=input_dict["separate_max"],
        random_state=random_state,
        save_results=True)

    return {"fitmodel_url": fitmodel_url}
def wrf_predict(input_dict):
    """
    Apply a fitted distributed weighted forest (rand variant) model.

    Reads "dataset", "fitmodel_url" and "coeff" from input_dict and forwards
    them to distributed_weighted_forest_rand.predict with save_results=True.

    Returns a dict with the single key "string" holding whatever predict()
    returned (presumably the URL of the stored predictions).

    Raises KeyError if a required key is missing from input_dict.
    """
    from discomll.ensemble import distributed_weighted_forest_rand

    predictions_url = distributed_weighted_forest_rand.predict(
        input=input_dict["dataset"],
        fitmodel_url=input_dict["fitmodel_url"],
        coeff=input_dict["coeff"],
        save_results=True)

    return {"string": predictions_url}
......
......@@ -28,26 +28,60 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Discretization",
"short_name": "spf",
"default": "equal_freq",
"description": "Select equal frequency discretization or random discretization for numeric attributes",
"name": "Discretization accuracy",
"short_name": "dac",
"default": "1",
"description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "split_fun",
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 7,
"order": 8,
"uid": "00758cdf-2eb5-43c5-bedf-bd3b8b9c29d6"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "separate_max",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 9,
"uid": "21444978-142f-4f3d-947c-20e0b41a2c9b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
"description": "The minimum number of samples in newly created leaves. A split is discarded if after the split, one of the leaves would contain less then min samples leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 5,
"uid": "52591706-7f30-4def-a788-3e07d3f82876"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Max tree nodes",
"short_name": "mnt",
"default": "20",
"default": "100",
"description": "Max. number of decision tree nodes",
"required": true,
"multi": false,
......@@ -88,7 +122,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 6,
"order": 7,
"uid": "68cbccf9-7469-4b55-b96e-4f7c6a3c9cde"
}
},
......@@ -105,7 +139,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 8,
"order": 10,
"uid": "8e6e2d96-3457-4b23-ac93-ab90b083920f"
}
},
......@@ -132,7 +166,7 @@
"widget": "4f2ce923-62e6-4be1-a394-72ac52988386",
"name": "Min samples split",
"short_name": "lmi",
"default": "5",
"default": "10",
"description": "Min. number of samples to split the node",
"required": true,
"multi": false,
......@@ -156,7 +190,7 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 5,
"order": 6,
"uid": "fe7f5d5a-c2e2-4ae9-b138-18b1de7c4e93"
}
},
......
......@@ -37,10 +37,44 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 5,
"order": 7,
"uid": "1b23ead1-b104-4d27-a6fd-b23de6efa28f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Randomly chosen medoids",
"short_name": "rcm",
"default": "3",
"description": "Number of randomly chosen medoids to calculate similaty.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "num_medoids",
"parameter": true,
"order": 4,
"uid": "1bbcbc2c-a9d5-4427-a8ef-e4dd58c22f86"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "",
"required": true,
"multi": false,
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 10,
"uid": "2ccff5c1-7e06-4887-863d-7acf76209e50"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -54,7 +88,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 8,
"order": 11,
"uid": "31c68e34-3bff-41bb-bf77-925c6171a6f6"
}
},
......@@ -75,13 +109,30 @@
"uid": "37879268-0aa9-4458-afb2-71a521acb299"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples in leaf",
"short_name": "msl",
"default": "5",
"description": "The minimum number of samples in newly created leaves. A split is discarded if after the split, one of the leaves would contain less then min samples leaf samples",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 6,
"uid": "3a893a69-f22e-448b-9a92-222573c655ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Max tree nodes",
"short_name": "mnt",
"default": "20",
"default": "100",
"description": "Max. number of decision tree nodes",
"required": true,
"multi": false,
......@@ -92,6 +143,23 @@
"uid": "3d48b0d0-a304-45d5-9d18-3ca17e8fcf05"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization accuracy",
"short_name": "dac",
"default": "1",
"description": "Continuous attributes are converted to discrete intervals. For exact estimation use 0 (slowest) or increase the number to get an approximation (faster).",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 9,
"uid": "3ff0f040-3d11-413f-975a-1fde57bf289b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -122,7 +190,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 6,
"order": 8,
"uid": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
}
},
......@@ -132,34 +200,17 @@
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Min samples split",
"short_name": "lmi",
"default": "5",
"default": "10",
"description": "Min. number of samples to split the node",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "leaf_min_inst",
"parameter": true,
"order": 4,
"order": 5,
"uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "72a39fab-5433-493f-ae22-12a264075356",
"name": "Discretization",
"short_name": "spf",
"default": "equal_freq",
"description": "Select equal frequency discretization or random discretization for numeric attributes",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "split_fun",
"parameter": true,
"order": 7,
"uid": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
......@@ -189,23 +240,5 @@
"value": "info_gain",
"abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Equal frequency discretization",
"uid": "4ea5c55a-92a8-4541-a1cc-9aabb0fd82c0",
"value": "equal_freq",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Random discretization",
"uid": "838f798d-e00e-4216-8990-ebc3c1929c0e",
"value": "random",
"abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
}
}
]
\ No newline at end of file
......@@ -71,7 +71,7 @@
"parameter_type": "select",
"variable": "measure",
"parameter": true,
"order": 7,
"order": 8,
"uid": "28f53666-76b0-4d44-acab-0824e603a848"
}
},
......@@ -88,7 +88,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 10,
"order": 11,
"uid": "40bc0e36-427f-4517-ac56-55ef033a0e9c"
}
},
......@@ -105,7 +105,7 @@
"parameter_type": "text",
"variable": "accuracy",
"parameter": true,
"order": 8,
"order": 9,
"uid": "45a0c36c-d61a-4708-b54a-6908494ee090"
}
},
......@@ -122,7 +122,7 @@
"parameter_type": "text",
"variable": "min_samples_leaf",
"parameter": true,
"order": 5,
"order": 6,
"uid": "739e0f16-2ac9-423e-8050-58778553ca48"
}
},
......@@ -181,7 +181,7 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "74df0d6e-684f-46ae-975d-ba1ce5425066",
"name": "Separate most present class",
"name": "Separate most represented class",
"short_name": "smp",
"default": "true",
"description": "",
......@@ -190,7 +190,7 @@
"parameter_type": "checkbox",
"variable": "separate_max",
"parameter": true,
"order": 9,
"order": 10,
"uid": "eac59b1f-c35d-4116-b4d7-9320d2b4b351"
}
},
......@@ -207,7 +207,7 @@
"parameter_type": "text",
"variable": "majority",
"parameter": true,
"order": 6,
"order": 7,
"uid": "f31f0f86-238b-4ce1-b7e7-1ad6e88f55b0"
}
},
......
......@@ -71,7 +71,7 @@
"parameter_type": "text",
"variable": "seed",
"parameter": true,
"order": 4,
"order": 3,
"uid": "8ec2b906-2b9e-4cda-9455-09ccd7d134fb"
}
},
......@@ -96,16 +96,16 @@
"model": "workflows.abstractinput",
"fields": {
"widget": "d9de579e-fdd9-47da-948e-69183f24340f",
"name": "Random forest - difference",
"short_name": "dff",
"default": "0.3",
"description": "Random forest calculates difference in probability between most and second most probable prediction. If difference is greater than parameter diff, it outputs prediction. If a test sample is hard to predict (difference is never higher than diff), it queries whole ensemble to make a prediction.",
"name": "Weighted forest - similarity coeff",
"short_name": "coe",
"default": "0.5",
"description": "Percentage of most similar treees to include in prediction (0 - 1)",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "diff",
"variable": "coeff",
"parameter": true,
"order": 3,
"order": 4,
"uid": "d979ff23-eb11-40cf-9d81-2a71ddf5d790"
}
},
......
......@@ -15,7 +15,6 @@ def measure_distribution(request,input_dict,output_dict,widget):
def bigdata_ca(request,input_dict,output_dict,widget):
from discomll.utils import accuracy
from disco.core import result_iterator
import os.path
from mothra.settings import MEDIA_ROOT
from workflows.helpers import ensure_dir
......@@ -32,7 +31,7 @@ def bigdata_ca(request,input_dict,output_dict,widget):
predictions = input_dict["predictions"],
measure = "ca")
string = "Classification Accuracy \n"
for k, v in result_iterator(results):
for k, v in results:
string += str(v) + "\n"
input_dict["string"] = string
......@@ -67,7 +66,7 @@ def bigdata_mse(request,input_dict,output_dict,widget):
predictions = input_dict["predictions"],
measure = "mse")
string = "Mean squared error\n"
for k, v in result_iterator(results):
for k, v in results:
string += str(v) + "\n"
input_dict["string"] = string
......
......@@ -9,8 +9,7 @@ import re
import itertools
import subprocess
def definition_sentences2(input_dict):
    """
    Placeholder widget function: ignores input_dict and returns an empty dict.
    """
    return {}
webservices_totrtale_url = "http://vihar.ijs.si:8104"
def merge_sentences(input_dict):
"""
......@@ -89,20 +88,19 @@ def load_corpus2(input_dict):
'''
use_text = input_dict["use_text"] == "true"
if use_text:
if use_text: #checkbox is checked
fname = "input_string.txt"
text = input_dict[u"text"].strip()
if len(text) == 0:
raise Exception("Please input text or uncheck the Use text checkbox.")
data = base64.b64encode(text)
else:
else: #checkbox is not checked
f = safeOpen(input_dict['file'])
fname = os.path.basename(input_dict['file'])
data = base64.b64encode(f.read())
#define web service
webservices_url = "http://vihar.ijs.si:8104"
webservice_url = webservices_url + "/parseFile"
webservice_url = webservices_totrtale_url + "/parseFile"
params = {"filename": fname, "text": data} #set params
#call web service
......@@ -115,7 +113,10 @@ def load_corpus2(input_dict):
"""
return {'corpus': content[u"resp"]}
def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
def parse_tei(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
"""
Helper function for load tagged corpus. Function parses TEI format.
"""
from xml.dom import minidom
fname = os.path.basename(path)
......@@ -141,6 +142,9 @@ def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sent
return "".join(tab_separated_output).encode("utf8", "ignore")
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
"""
Helper function for load tagged corpus. Function parses tab separated format.
"""
fname = os.path.basename(path)
f = safeOpen(path)
......@@ -152,7 +156,7 @@ def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, s
sentence_counter = 0
for line in f:
splitted_line = re.split(separator, line.strip())#.split(separator)
splitted_line = re.split(separator, line.strip())
if len(splitted_line) >= 4:
new_line = splitted_line[word_index] + "\t" + splitted_line[token_index] + "\t" + splitted_line[lemma_index] + "\t" + splitted_line[pos_index] + "\t\n"
data.append(new_line)
......@@ -175,11 +179,10 @@ def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, s
def load_tagged_corpus(input_dict):
"""
Loads TEI file, which is output of totrtale
Loads a tagged corpus from a file in tab-separated or TEI (XML) format.
"""
data = ""
if input_dict["input_format"] == "tab_format":
try:
word_index = int(input_dict["word_index"]) - 1
......@@ -205,16 +208,6 @@ def load_tagged_corpus(input_dict):
data = parse_tab_separated(input_dict['file'], word_index=word_index, token_index=token_index, lemma_index=lemma_index, pos_index=pos_index, start_tag=start_tag, end_tag=end_tag, separator=separator)
else:
#fname = os.path.basename(input_dict['file'])
#data = f.read()
#path = os.path.dirname(os.path.abspath(__file__)) + os.sep
#subprocess.call(["java -jar " + path+"jing.jar " + path+ "tei_imp.rng <" + data + " >" + "out.txt"],shell=True)
#f = open("out.txt", "r")
#error = f.read()
#if len(error) > 0:
# raise Exception(error)
lemma_name = input_dict["lemma_name"]
pos_name = input_dict["pos_name"]
sentence_tag = input_dict["sentence_tag"]
......@@ -223,18 +216,19 @@ def load_tagged_corpus(input_dict):
if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
raise Exception("Please review parameters for TEI format.")
data = parse_xml(input_dict['file'], lemma_name = lemma_name, pos_name = pos_name, word_tag = word_tag, sentence_tag = sentence_tag)
data = parse_tei(input_dict['file'], lemma_name = lemma_name, pos_name = pos_name, word_tag = word_tag, sentence_tag = sentence_tag)
return {'annotations': data}
def totrtale_request(params):
    """
    Send params to the /runToTrTaLe endpoint of the ToTrTaLe web service.

    The base URL comes from the module-level webservices_totrtale_url
    constant. Returns whatever post() returns for the request.
    """
    webservice_url = webservices_totrtale_url + "/runToTrTaLe"
    return post(webservice_url, params=params)
def nlp_totrtale2(input_dict, widget):
'''
Calls the totrtale web service.
Function splits huge documents into smaller pieces and sends them separately to the totrtale web service. If there are multiple smaller documents, this function groups them and sends them together.
'''
import multiprocessing
from xml.dom.minidom import parseString
......@@ -242,16 +236,15 @@ def nlp_totrtale2(input_dict, widget):
import math
import copy
progress_accumulator = 0
widget.progress= progress_accumulator
progress_accumulator = 0 #progress for progress bar
widget.progress= progress_accumulator
widget.save()
processes = 4
DOCUMENTS_SIZE = 3 * int(1e6) #Document size (MB) per process
SINGLE_DOC_SIZE = 1 * int(1e6)
processes = 4 #number of processes for multiprocessing
DOCUMENTS_SIZE = 3 * int(1e6) #size of a group of documents in MB per process
SINGLE_DOC_SIZE = 1 * int(1e6) #size of a single document per process
corpus = parseString(input_dict['corpus'])
language = input_dict['lang'],
postprocess = input_dict['postprocess'] == "true"
xml = input_dict['xml'] == "true"
......@@ -270,15 +263,14 @@ def nlp_totrtale2(input_dict, widget):
pool = multiprocessing.Pool(processes=processes)
documents = corpus.getElementsByTagName('TEI')
documents_size, document_num, process_num = 0, 0, 1
#titles = []
results, docs, single_docs = [], [], []
for i, document in enumerate(documents):
doc_len = len(document.getElementsByTagName('body')[0].getElementsByTagName('p')[0].childNodes[0].nodeValue)
doc_title = document.getElementsByTagName('title')[0].firstChild.nodeValue
#titles.append(doc_title)
print doc_title
if doc_len > SINGLE_DOC_SIZE:
#split single huge document
predhead = '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
title = '<title>' + doc_title + '</title>\n'
......@@ -319,6 +311,7 @@ def nlp_totrtale2(input_dict, widget):
single_docs.append(2)
print "document was split",doc_title, len(single_docs)
else:
#group multiple smaller documents.
docs.append(document.toxml())
document_num+=1
documents_size += doc_len
......@@ -336,11 +329,9 @@ def nlp_totrtale2(input_dict, widget):
single_docs.append(-1)
pool.close()
#we need to join results of totrtale processing back together. Function also updates progress bar.
response = ["" for i in results]
progress = [True]
while any(progress):
time.sleep(1)
progress = [not result.ready() for result in results]
......@@ -356,6 +347,7 @@ def nlp_totrtale2(input_dict, widget):
progress = [False]
raise Exception(resp["error"])
if xml:
#results are in xml
if single_docs[i] == 0:
print "remove back", i
pos1 = resp["resp"].find("<s>")
......@@ -374,6 +366,7 @@ def nlp_totrtale2(input_dict, widget):
print "nothing to remove"
response[i] = resp["resp"]
else:
#results are tab separated
if single_docs[i] in [0,1]:
pos2 = resp["resp"].find("</TEXT>")
response[i] = resp["resp"][:pos2]
......@@ -388,7 +381,8 @@ def nlp_totrtale2(input_dict, widget):
widget.save()
pool.join()