Création d'un compte pour un collaborateur extérieur au laboratoire depuis l'intranet ICube : https://intranet.icube.unistra.fr/fr/labs/member/profile

Commit 34b5f041 authored by hiphop
Browse files

load tagged corpus

parent 6a990d32
import nlp
import os.path
import os
import base64
from services.webservice import WebService
from workflows.security import safeOpen
......@@ -7,6 +7,7 @@ from requests import post
import json
import re
import itertools
import subprocess
def definition_sentences2(input_dict):
return {}
......@@ -114,14 +115,116 @@ def load_corpus2(input_dict):
"""
return {'corpus': content[u"resp"]}
def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
    """Convert an XML (TEI-style) tagged file into tab-separated corpus text.

    The document at ``path`` is parsed with ``xml.dom.minidom``.  Every
    ``sentence_tag`` element becomes an ``<S id="0_N">`` ... ``</S>`` section
    (N counts sentences from 0) and every ``word_tag`` element inside it
    becomes one row of the form ``token<TAB>TOK<TAB>lemma<TAB>pos<TAB>``.
    The whole output is wrapped in ``<TEXT title=FILENAME>`` ... ``</TEXT>``.

    Parameters:
        path: path of the XML file to read.
        lemma_name: name of the word attribute holding the lemma.
        pos_name: name of the word attribute holding the part-of-speech tag.
        word_tag: tag name of word elements.
        sentence_tag: tag name of sentence elements.

    Returns:
        The converted text encoded as UTF-8.

    The lemma/POS attribute lookups are not guarded, so a word element that
    lacks either attribute is an error.
    """
    from xml.dom import minidom

    fname = os.path.basename(path)
    xmldoc = minidom.parse(path)

    rows = ["<TEXT title=" + fname + ">\t\n"]
    for sentence_id, sentence in enumerate(xmldoc.getElementsByTagName(sentence_tag)):
        rows.append("\t<S id=\"0_" + str(sentence_id) + "\">\t\n")
        for word in sentence.getElementsByTagName(word_tag):
            # An empty element (<w .../>) has no children, and a non-text
            # first child has nodeValue None; emit an empty token in both
            # cases instead of crashing, so lemma and POS are still kept.
            first_child = word.firstChild
            token = first_child.nodeValue if first_child is not None else None
            if token is None:
                token = ""
            rows.append(
                token + "\tTOK\t"
                + word.attributes[lemma_name].value + "\t"
                + word.attributes[pos_name].value + "\t\n"
            )
        rows.append("\t</S>\t\n")
    rows.append("</TEXT>\t\n")
    return "".join(rows).encode("utf8", "ignore")
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
    """Normalise a column-separated tagged file into the canonical tab layout.

    Each input line is stripped and split with the regular expression
    ``separator``.  Lines with at least four fields are rewritten as
    ``word<TAB>token<TAB>lemma<TAB>pos<TAB>`` using the given zero-based
    column indices.  Shorter lines are scanned field by field: the first
    field matching ``start_tag`` produces an ``<S id="0_N">`` line, the first
    field matching ``end_tag`` produces ``</S>`` and advances the sentence
    counter N; lines matching neither are passed through re-joined with tabs.
    The output is wrapped in ``<TEXT title=FILENAME>`` ... ``</TEXT>``.

    Parameters:
        path: path of the file to read (opened through ``safeOpen``).
        word_index, token_index, lemma_index, pos_index: zero-based column
            positions of the respective fields.
        start_tag, end_tag: regular expressions (used with ``re.match``)
            recognising sentence start / end markers.
        separator: regular expression used to split a line into fields.

    Returns:
        The converted corpus as a single string.

    NOTE(review): only ``len(fields) >= 4`` is checked before indexing, so an
    index beyond the number of fields on a line still raises IndexError.
    """
    fname = os.path.basename(path)
    data = ["<TEXT title=" + fname + ">\t\n"]
    sentence_counter = 0

    f = safeOpen(path)
    try:
        for line in f:
            fields = re.split(separator, line.strip())
            if len(fields) >= 4:
                data.append(
                    fields[word_index] + "\t" + fields[token_index] + "\t"
                    + fields[lemma_index] + "\t" + fields[pos_index] + "\t\n"
                )
                continue

            added = False
            for el in fields:
                candidate = el.strip()
                if re.match(start_tag, candidate):
                    data.append("\t<S id=\"0_" + str(sentence_counter) + "\">\t\n")
                    added = True
                    break
                elif re.match(end_tag, candidate):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                # Neither a data row nor a sentence marker: keep the line.
                data.append("\t".join(fields + ["\t\n"]))
    finally:
        # The handle was previously never released; close it even when a
        # malformed line raises.  Assumes safeOpen returns a file-like
        # object with close() -- TODO confirm.
        f.close()

    data.append("</TEXT>\t\n")
    return "".join(data)
def load_tagged_corpus(input_dict):
    """
    Load an already tagged corpus and return it in tab-separated form.

    ``input_dict['file']`` is the path of the corpus and
    ``input_dict['input_format']`` selects how it is interpreted:

    * ``"tab_format"``: a column-separated file.  ``word_index``,
      ``token_index``, ``lemma_index`` and ``pos_index`` are one-based column
      numbers; ``start_tag``, ``end_tag`` and ``separator`` describe sentence
      markers and the field separator.  When the columns are 1, 2, 3, 4 and
      the tags are ``<S>`` / ``</S>`` the file content is returned unchanged,
      otherwise it is converted with ``parse_tab_separated``.
    * anything else: an XML (TEI) file converted with ``parse_xml`` using
      ``lemma_name``, ``pos_name``, ``sentence_tag`` and ``word_tag``.

    Returns:
        A dict ``{'annotations': data}`` with the resulting corpus text.

    Raises:
        Exception: if an index is not a number, a required text parameter is
            empty, or the four column indices are not distinct.
    """
    # Always go through safeOpen first: the TEI branch hands the raw path to
    # parse_xml, so this is the only place that branch uses safeOpen
    # (presumably a path check -- verify against workflows.security).
    # Read once and close; the content is reused below instead of reopening.
    source = safeOpen(input_dict['file'])
    try:
        raw_content = source.read()
    finally:
        source.close()

    # TODO: validate TEI input against tei_imp.rng (a jing-based check was
    # sketched here but never enabled).
    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]
        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review start, end tag and separator parameters.")

        # Indices are zero-based here, so the default 1/2/3/4 layout is 0..3.
        is_default_layout = (
            word_index == 0 and token_index == 1
            and lemma_index == 2 and pos_index == 3
            and start_tag == u'<S>' and end_tag == '</S>'
        )
        if is_default_layout:
            # Already in the canonical layout: pass the content through.
            data = raw_content
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict['file'], word_index=word_index, token_index=token_index, lemma_index=lemma_index, pos_index=pos_index, start_tag=start_tag, end_tag=end_tag, separator=separator)
    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]
        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")
        data = parse_xml(input_dict['file'], lemma_name = lemma_name, pos_name = pos_name, word_tag = word_tag, sentence_tag = sentence_tag)

    return {'annotations': data}
def totrtale_request(params):
......@@ -545,6 +648,7 @@ def TEItoTab(text, doc_id=0):
choice_found=False #if lang in ["gaji", "boho"]
local_s=""
for l in text.splitlines():
print l
if "<choice>" in l:
choice_found=True
......
......@@ -11,7 +11,7 @@
"has_progress_bar": false,
"image": "",
"description": "",
"static_image": "",
"static_image": "nlp.png",
"action": "load_tagged_corpus",
"visualization_view": "",
"streaming_visualization_view": "",
......@@ -24,6 +24,244 @@
"name": "Load tagged corpus"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "TEI format settings",
"short_name": "tfs",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tfs",
"parameter": true,
"order": 4,
"uid": "1c044e6d-454f-41fd-85dd-26402d16c306"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma name",
"short_name": "lmn",
"default": "lemma",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_name",
"parameter": true,
"order": 5,
"uid": "23738c56-2b0e-4f49-92f7-ea3805e082e6"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Start tag",
"short_name": "stg",
"default": "<S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "start_tag",
"parameter": true,
"order": 14,
"uid": "35a13690-c7e2-424c-b724-2328b0701d91"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Sentence tag",
"short_name": "sen",
"default": "s",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "sentence_tag",
"parameter": true,
"order": 7,
"uid": "463c90ce-3865-4b7f-8bd1-22c8f37f737a"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS index",
"short_name": "psi",
"default": "4",
"description": "Part of speech index.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_index",
"parameter": true,
"order": 13,
"uid": "5861150d-3e9c-4fcb-b08d-55e8ba3b99ea"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word tag",
"short_name": "wtg",
"default": "w",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_tag",
"parameter": true,
"order": 8,
"uid": "5ed28783-37ed-4790-8cba-7edeac35a1ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "End tag",
"short_name": "etg",
"default": "</S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "end_tag",
"parameter": true,
"order": 15,
"uid": "7198fadf-38a0-4565-9bcc-6d11ccfd891f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Input format",
"short_name": "ifr",
"default": "tei_format",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "input_format",
"parameter": true,
"order": 3,
"uid": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Tab separated format settings",
"short_name": "tss",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tss",
"parameter": true,
"order": 9,
"uid": "81b91f82-535b-4d00-9970-a95a4ee1727f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Separator",
"short_name": "sep",
"default": "\\t",
"description": "Define regex expression",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "separator",
"parameter": true,
"order": 20,
"uid": "83f4d974-0cee-4a2e-bddf-18f97d9d19ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Token index",
"short_name": "tki",
"default": "2",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "token_index",
"parameter": true,
"order": 11,
"uid": "88714bc1-4489-40a7-b601-ada613d1fa99"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS name",
"short_name": "psn",
"default": "ana",
"description": "Part of speech tag",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_name",
"parameter": true,
"order": 6,
"uid": "9f796b5c-6093-494a-abdd-46cfae74a90d"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma index",
"short_name": "lmi",
"default": "3",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_index",
"parameter": true,
"order": 12,
"uid": "a9420c84-1f6d-430a-8370-77a6d2b7559b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word index",
"short_name": "wdi",
"default": "1",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_index",
"parameter": true,
"order": 10,
"uid": "e5371a20-c9bd-44d8-a351-e40ad697fc79"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -52,5 +290,23 @@
"order": 1,
"uid": "009be1e0-8ce5-4a33-be87-c9b38b25b192"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Tab separated format",
"uid": "473ec8d3-15ad-4097-86b3-3f7078fbaa91",
"value": "tab_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "TEI format",
"uid": "ec1260b0-1adc-4ef2-a2aa-1806393e2490",
"value": "tei_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
}
]
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment