Commit 901c3e49 authored by hiphop's avatar hiphop

new widgets

parent 9b4e3001
......@@ -8,9 +8,6 @@ import json
import re
import itertools
webservices_url = "http://vihar.ijs.si:8104"
def merge_sentences(input_dict):
"""
Merges the input sentences in XML according to the specified method.
......@@ -57,27 +54,33 @@ def load_corpus2(input_dict):
'''
Parses an input file and encodes it in base 64.
'''
use_text = input_dict["use_text"] == "true"
if input_dict[u"text"] == "":
if use_text:
fname = "input_string.txt"
text = input_dict[u"text"].strip()
if len(text) == 0:
raise Exception("Please input text or uncheck the Use text checkbox.")
data = base64.b64encode(text)
else:
f = safeOpen(input_dict['file'])
fname = os.path.basename(input_dict['file'])
data = base64.b64encode(f.read())
else:
fname = "input_string.txt"
data = base64.b64encode(input_dict[u"text"].strip())
#define web service
webservices_url = "http://vihar.ijs.si:8104"
webservice_url = webservices_url + "/parseFile"
params = {"filename": fname, "text": data} #set params
#call web service
resp = post(webservice_url, params=params)
content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
"""
if content[u"error"] != "":
raise Exception(content[u"error"])
else:
return {'corpus': content[u"resp"]}
"""
return {'corpus': content[u"resp"]}
def load_tagged_corpus(input_dict):
"""
......@@ -90,6 +93,7 @@ def load_tagged_corpus(input_dict):
return {'annotations': data}
def totrtale_request(params):
    """
    Sends one request to the ToTrTaLe web service.

    params -- request parameters, forwarded unchanged to ``post``.

    Returns whatever ``post`` returns; the response is not inspected here.
    """
    service_root = "http://vihar.ijs.si:8104"
    return post(service_root + "/runToTrTaLe", params=params)
......@@ -163,7 +167,7 @@ def nlp_totrtale2(input_dict, widget):
sub_params["text"] = predhead +title + head + document_text[prev_j: curr_j+2] +tail
else:
sub_params["text"] = predhead + head + document_text[prev_j: curr_j+2] + tail
results.append(pool.apply_async(totrtale_request, args=[sub_params]))
if prev_j == 0:
single_docs.append(0)
......@@ -188,7 +192,6 @@ def nlp_totrtale2(input_dict, widget):
documents_size += doc_len
if documents_size > DOCUMENTS_SIZE or (document_num) % 10==0 or i == len(documents)-1:
#print "Log:",process_num, "process added to queue with", document_num, "documents"
documents_size = 0
document_num = 0
sub_params = copy.deepcopy(params)
......@@ -200,6 +203,8 @@ def nlp_totrtale2(input_dict, widget):
single_docs.append(-1)
pool.close()
response = ["" for i in results]
progress = [True]
......@@ -237,8 +242,6 @@ def nlp_totrtale2(input_dict, widget):
response[i] = resp["resp"]
else:
if single_docs[i] in [0,1]:
#print "remove back", i, single_docs[i]
#pos1 = resp["resp"].find("<p>")
pos2 = resp["resp"].find("</TEXT>")
response[i] = resp["resp"][:pos2]
else:
......@@ -302,6 +305,70 @@ def nlp_term_extraction(input_dict):
threshold=0)
return {'candidates': response['candidates']}
def get_default_stop_word_list(lang):
    """
    Returns the built-in stop word list for a language code.

    lang -- language code; "en" and "sl" have built-in lists.

    A new list is returned on every call. Any other language code yields an
    empty list, so the result can always be concatenated with another list
    (an implicit None here would break ``stop_list + user_stop_words``).
    """
    if lang == "en":
        return ["et al"]
    elif lang == "sl":
        return ["itd", "slon", "ovira", "zob"]
    # No built-in stop words for this language.
    return []
def _load_user_stop_words(path):
    """Reads a UTF-8 stop word file and returns its non-empty, lower-cased entries."""
    # Assumes safeOpen returns a file-like object (it is used with .read()
    # in this module) -- TODO confirm it also supports .close().
    stop_file = safeOpen(path)
    try:
        raw = stop_file.read()
    finally:
        stop_file.close()

    # Keep the decoded text: the candidate terms come from json.loads, so
    # comparing them against undecoded bytes would miss non-ASCII stop words.
    if isinstance(raw, bytes):
        try:
            raw = raw.decode("utf-8")
        except UnicodeError:
            raise Exception("Please make sure that your stop words list is encoded in UTF-8.")

    # splitlines() also copes with "\r\n" files; blank lines are skipped so
    # that an empty string never ends up in the stop list. Entries are
    # lower-cased because terms are lower-cased before the comparison.
    entries = (line.strip().lower() for line in raw.splitlines())
    return [entry for entry in entries if entry]


def _remove_stop_word_candidates(candidates, stop_list):
    """Drops candidate lines whose term contains a stop word or stop phrase."""
    single_words = set(entry for entry in stop_list if " " not in entry)
    # Multi-word entries (e.g. "et al") can never equal a single word of the
    # term, so they are matched as whole-word phrases on the padded term.
    phrases = [" " + entry + " " for entry in stop_list if " " in entry]

    kept = []
    for line in candidates.split("\n"):
        columns = line.split("\t")
        if len(line) == 0 or len(columns) < 2:
            # Empty lines and lines without a term column are passed through.
            kept.append(line)
            continue
        # The first and last character of the term column are dropped
        # (presumably surrounding delimiters -- confirm against the service).
        term = columns[1][1:-1].lower()
        if any(word in single_words for word in term.split(" ")):
            continue
        padded = " " + term + " "
        if any(phrase in padded for phrase in phrases):
            continue
        kept.append(line)
    return "\n".join(kept)


def nlp_term_extraction2(input_dict):
    '''
    Term extraction from totrtale annotations.

    Reads from input_dict:
    annotations -- the annotated corpus; converted with TEItoTab when it
                   contains the TEI namespace marker
    lang -- "sl" or "en"
    stop_list -- the string "true" enables the built-in stop word list
    stop_words_file -- path of a UTF-8 file with one stop word per line,
                       or "" for none
    slovene_reference_corpus / english_reference_corpus -- reference corpus
                       for the selected language

    Returns {'candidates': text} where text is the web service result with
    every line removed whose term contains a stop word.

    Raises Exception when the stop word file is not valid UTF-8 or when the
    language is not supported.
    '''
    ws_url = "http://vihar.ijs.si:8081/call"
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    stop_list_checkbox = input_dict["stop_list"] == "true"

    user_stop_words = []
    if input_dict['stop_words_file'] != "":
        user_stop_words = _load_user_stop_words(input_dict['stop_words_file'])

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]
    else:
        # Fail with a clear message instead of an unbound local further down.
        raise Exception("Unsupported language: %r" % (lang,))

    params = {"corpus": annotations,
              "lang": lang,
              "reference_corpus": reference_corpus}
    # NOTE(review): if `post` is requests.post, `params=` sends the corpus in
    # the URL query string -- confirm that this is what the service expects.
    response = post(ws_url, params=params)
    resp = json.loads(response.content)[u'callResponse'][u'callResult']

    stop_list = []
    if stop_list_checkbox:
        # `or []` guards against a missing built-in list.
        stop_list = get_default_stop_word_list(lang) or []
    stop_list = set(stop_list) | set(user_stop_words)

    if stop_list:
        resp = _remove_stop_word_candidates(resp, stop_list)
    return {'candidates': resp}
def nlp_def_extraction_patterns(input_dict):
'''
......@@ -320,6 +387,26 @@ def nlp_def_extraction_patterns(input_dict):
lang=lang, pattern=pattern)
return {'sentences': response['candidates']}
def nlp_def_extraction_patterns2(input_dict):
    """
    Extracts definition sentences with pre-defined patterns.

    Reads 'annotations', 'lang' and 'pattern' from input_dict. Annotations
    containing the TEI namespace marker are first converted with TEItoTab.
    The patDefSent web service result is returned under the key 'sentences'.
    """
    corpus = input_dict['annotations']
    language = input_dict['lang']
    chosen_pattern = input_dict['pattern']

    tei_marker = '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
    if tei_marker in corpus:
        corpus = TEItoTab(corpus)

    request_params = {
        "corpus": corpus,
        "pattern": chosen_pattern,
        "lang": language,
    }
    reply = post("http://vihar.ijs.si:8081/patDefSent", params=request_params)
    payload = json.loads(reply.content)
    return {'sentences': payload[u'patDefSentResponse'][u'patDefSentResult']}
def nlp_def_extraction_terms(input_dict):
'''
......@@ -348,6 +435,40 @@ def nlp_def_extraction_terms(input_dict):
num_multiterms=num_multiterms, term_beginning=term_beginning)
return {'sentences': response['candidates']}
def nlp_def_extraction_terms2(input_dict):
    """
    Extracts definition sentences based on term candidates.

    Reads the annotations, the term candidates, the language and the
    filtering options from input_dict and forwards them to the termDefSent
    web service. Annotations containing the TEI namespace marker are first
    converted with TEItoTab. The service result is returned under the key
    'sentences'.
    """
    # All inputs are read up front, before any conversion or network call.
    corpus = input_dict['annotations']
    candidates = input_dict['term_candidates']
    language = input_dict['lang']
    per_sentence = input_dict['terms_per_sentence']
    nominative_count = input_dict['nominatives']
    select_threshold = input_dict['threshold']
    verb_option = input_dict['verb_two_terms']
    first_is_multiword = input_dict['multiword_term']
    multiterm_count = input_dict['num_multiterms']
    term_at_start = input_dict['term_beginning']

    tei_marker = '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
    if tei_marker in corpus:
        corpus = TEItoTab(corpus)

    request_params = {
        "corpus": corpus,
        "candidates": candidates,
        "lang": language,
        "nominatives": nominative_count,
        "terms_per_sentence": per_sentence,
        # The widget's threshold is sent under the service's "select" name.
        "select": select_threshold,
        "verb_two_terms": verb_option,
        "multiword_term": first_is_multiword,
        "num_multiterms": multiterm_count,
        "term_beginning": term_at_start,
    }
    reply = post("http://vihar.ijs.si:8081/termDefSent", params=request_params)
    payload = json.loads(reply.content)
    return {'sentences': payload[u'termDefSentResponse'][u'termDefSentResult']}
def nlp_def_extraction_wnet(input_dict):
'''
......@@ -364,6 +485,22 @@ def nlp_def_extraction_wnet(input_dict):
response = ws.client.GlossaryExtractionByWnet(corpus=annotations, lang=lang)
return {'sentences': response['candidates']}
def nlp_def_extraction_wnet2(input_dict):
    """
    Extracts definition sentences with the help of WordNet.

    Reads 'annotations' and 'lang' from input_dict. Annotations containing
    the TEI namespace marker are first converted with TEItoTab. The
    wnetDefSent web service result is returned under the key 'sentences'.
    """
    corpus = input_dict['annotations']
    language = input_dict['lang']

    tei_marker = '<TEI xmlns="http://www.tei-c.org/ns/1.0">'
    if tei_marker in corpus:
        corpus = TEItoTab(corpus)

    request_params = {"corpus": corpus, "lang": language}
    reply = post("http://vihar.ijs.si:8081/wnetDefSent", params=request_params)
    payload = json.loads(reply.content)
    return {'sentences': payload[u'wnetDefSentResponse'][u'wnetDefSentResult']}
def TEItoTab(text):
mask1 = ["\tTOK\t", "\t", "\t\n"]
......@@ -400,4 +537,6 @@ def TEItoTab(text):
newText.append("<TEXT title=" + title + ">\t\n")
elif "</body>" in l:
newText.append("</TEXT>\t\n")
return "".join(newText)
\ No newline at end of file
return "".join(newText)
\ No newline at end of file
[
{
"model": "workflows.abstractwidget",
"fields": {
"category": "9a30eafc-37b8-48f2-8a92-692c4b324dff",
"treeview_image": "",
"uid": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"is_streaming": false,
"package": "nlp",
"interaction_view": "",
"has_progress_bar": false,
"image": "",
"description": "",
"static_image": "nlp.png",
"action": "nlp_def_extraction_terms2",
"visualization_view": "",
"streaming_visualization_view": "",
"post_interact_action": "",
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"order": 1,
"name": "Definition extraction by terms2"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "First term should be a multi-word term",
"short_name": "mwt",
"default": "false",
"description": "First term should be a multi-word term",
"required": false,
"multi": false,
"parameter_type": "checkbox",
"variable": "multiword_term",
"parameter": true,
"order": 7,
"uid": "25c57fa3-b085-4edf-a0ce-674f1dcd90f2"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Annotations",
"short_name": "ann",
"default": "",
"description": "Totrtale annotations",
"required": true,
"multi": false,
"parameter_type": null,
"variable": "annotations",
"parameter": false,
"order": 10,
"uid": "2ff7416f-417f-4033-ac66-5e9a386d1a5a"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Threshold top % terms",
"short_name": "thr",
"default": "2%",
"description": "Threshold",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "threshold",
"parameter": true,
"order": 5,
"uid": "929e4938-f51d-45cd-b8f1-0cbe6ea0d790"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Multi-terms in a sentence",
"short_name": "mtn",
"default": "1",
"description": "Multi-terms in a sentence",
"required": false,
"multi": false,
"parameter_type": "text",
"variable": "num_multiterms",
"parameter": true,
"order": 8,
"uid": "a49780d2-43cf-4bcb-b6dd-af867b2efb5d"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Language",
"short_name": "lan",
"default": "sl",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "lang",
"parameter": true,
"order": 2,
"uid": "b8b9fe35-d7be-46fb-953a-95a08b84b875"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "One term should occur at the beginning of the sentence",
"short_name": "tbs",
"default": "false",
"description": "One term should occur at the beginning of the sentence",
"required": false,
"multi": false,
"parameter_type": "checkbox",
"variable": "term_beginning",
"parameter": true,
"order": 9,
"uid": "c17f11e0-f35d-4998-ba7e-dd7c941d1a8c"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Verb between two terms",
"short_name": "vtt",
"default": "none",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "verb_two_terms",
"parameter": true,
"order": 6,
"uid": "c69d46ff-b14b-4d16-b231-7c8ad6ca6797"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Nominatives",
"short_name": "nom",
"default": "0",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "nominatives",
"parameter": true,
"order": 4,
"uid": "cb415d1d-10c0-4a6e-94bc-bea998b920b5"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Candidates",
"short_name": "can",
"default": "",
"description": "Term candidates",
"required": true,
"multi": false,
"parameter_type": null,
"variable": "term_candidates",
"parameter": false,
"order": 1,
"uid": "cdd3a7c9-971e-4c19-8fc3-e6bafb42c916"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Terms per sentence",
"short_name": "tps",
"default": "3",
"description": "Terms per sentence",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "terms_per_sentence",
"parameter": true,
"order": 3,
"uid": "f1961c66-9680-4026-ae6f-ef47675aa31c"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
"widget": "09d72ebd-2eaf-47f6-95e0-2f2ed0c51ea2",
"name": "Sentences",
"short_name": "sen",
"description": "Definition sentences",
"variable": "sentences",
"order": 1,
"uid": "67c0eff2-44cf-42cf-901a-94b4e3571d84"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Between all",
"uid": "01f5514e-e113-42c2-b51c-5c5601dbb0f1",
"value": "all",
"abstract_input": "c69d46ff-b14b-4d16-b231-7c8ad6ca6797"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "1",
"uid": "11038cd5-4ab3-48dd-adbb-b93b6bc7a73d",
"value": "1",
"abstract_input": "cb415d1d-10c0-4a6e-94bc-bea998b920b5"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "English",
"uid": "4579b5c5-c3a2-4464-860f-9b5201e98c8b",
"value": "en",
"abstract_input": "b8b9fe35-d7be-46fb-953a-95a08b84b875"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Only between first two terms",
"uid": "4d00af9b-53af-43bf-aae5-1c156adbf577",
"value": "two",
"abstract_input": "c69d46ff-b14b-4d16-b231-7c8ad6ca6797"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "None",
"uid": "63bcfec4-daf4-469d-8321-98862022035c",
"value": "none",
"abstract_input": "c69d46ff-b14b-4d16-b231-7c8ad6ca6797"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "2",
"uid": "b9645b99-3fbd-4ed7-8a44-1c757a8408e9",
"value": "2",
"abstract_input": "cb415d1d-10c0-4a6e-94bc-bea998b920b5"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "0",
"uid": "bd1bb39b-a730-4d62-8d62-6d46eecd356b",
"value": "0",
"abstract_input": "cb415d1d-10c0-4a6e-94bc-bea998b920b5"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Slovene",
"uid": "d2d95064-d617-41b2-8a43-2f2203402c99",
"value": "sl",
"abstract_input": "b8b9fe35-d7be-46fb-953a-95a08b84b875"
}
}
]
\ No newline at end of file
[
{
"model": "workflows.abstractwidget",
"fields": {
"category": "9a30eafc-37b8-48f2-8a92-692c4b324dff",
"treeview_image": "",
"uid": "1e75106f-ba4c-4b2b-a8cf-fb0f9c1d42f8",
"is_streaming": false,
"package": "nlp",
"interaction_view": "",
"has_progress_bar": false,
"image": "",
"description": "",
"static_image": "nlp.png",
"action": "nlp_def_extraction_wnet2",
"visualization_view": "",
"streaming_visualization_view": "",
"post_interact_action": "",
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"order": 1,
"name": "Definition extraction by wordnet2"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "1e75106f-ba4c-4b2b-a8cf-fb0f9c1d42f8",
"name": "Annotations",
"short_name": "ann",
"default": "",
"description": "Totrtale annotations",
"required": true,
"multi": false,
"parameter_type": null,
"variable": "annotations",
"parameter": false,
"order": 1,
"uid": "0ade20ab-9c3e-4e7a-8361-12a1176ad8ae"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "1e75106f-ba4c-4b2b-a8cf-fb0f9c1d42f8",
"name": "Language",
"short_name": "lan",
"default": "sl",
"description": "",
"required": false,
"multi": false,
"parameter_type": "select",
"variable": "lang",
"parameter": true,
"order": 1,
"uid": "5c66e5c4-b226-49c0-b0b6-aaf7567651df"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
"widget": "1e75106f-ba4c-4b2b-a8cf-fb0f9c1d42f8",
"name": "Sentences",
"short_name": "sen",
"description": "Definition sentences",
"variable": "sentences",
"order": 1,
"uid": "995475da-5764-43cd-baa7-37bce6d28e91"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "English",
"uid": "37d0a55c-83f7-49d2-89b6-11fbc76eec4d",
"value": "en",
"abstract_input": "5c66e5c4-b226-49c0-b0b6-aaf7567651df"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Slovene",
"uid": "5e340b6a-14a8-42c7-a80a-d75a08ac570f",
"value": "sl",
"abstract_input": "5c66e5c4-b226-49c0-b0b6-aaf7567651df"
}
}
]
\ No newline at end of file
......@@ -64,7 +64,7 @@
"widget": "27273666-764d-458d-9513-0715ba2e6b4d",
"name": "Language",
"short_name": "lan",
"default": "en",
"default": "sl",
"description": "Language",
"required": true,
"multi": false,
......
[
{
"model": "workflows.abstractwidget",
"fields": {
"category": "15acb469-c510-44f0-8330-60bfe11a463c",
"treeview_image": "",
"uid": "4414ef0d-4dde-46ee-ba7b-12da93daef15",
"is_streaming": false,
"package": "nlp",
"interaction_view": "",
"has_progress_bar": false,
"image": "",
"description": "Term extraction from totrtale annotations.",
"static_image": "nlp.png",
"action": "nlp_term_extraction2",
"visualization_view": "",
"streaming_visualization_view": "",
"post_interact_action": "",
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"windows_queue": false,
"order": 1,
"name": "Term extraction2"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4414ef0d-4dde-46ee-ba7b-12da93daef15",
"name": "Slovene reference corpus",
"short_name": "src",
"default": "kres",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "slovene_reference_corpus",
"parameter": true,
"order": 2,
"uid": "073abc3f-1d53-4b42-9d52-2bf9521b3a5c"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "4414ef0d-4dde-46ee-ba7b-12da93daef15",