Commit 34b5f041 authored by hiphop

load tagged corpus

parent 6a990d32
import nlp
import os.path
import os
import base64
from services.webservice import WebService
from workflows.security import safeOpen
......@@ -7,6 +7,7 @@ from requests import post
import json
import re
import itertools
import subprocess
def definition_sentences2(input_dict):
    return {}
......@@ -114,14 +115,116 @@ def load_corpus2(input_dict):
"""
return {'corpus': content[u"resp"]}
def parse_xml(path, lemma_name="lemma", pos_name="ana", word_tag="w", sentence_tag="s"):
    from xml.dom import minidom

    fname = os.path.basename(path)
    xmldoc = minidom.parse(path)
    sentences = xmldoc.getElementsByTagName(sentence_tag)

    tab_separated_output = []
    head = "<TEXT title="+fname+">\t\n"
    foot = "</TEXT>\t\n"
    tab_separated_output.append(head)

    sentence_id = 0
    for sentence in sentences:
        line = "\t<S id=\"0_" + str(sentence_id) + "\">\t\n"
        tab_separated_output.append(line)
        for word in sentence.getElementsByTagName(word_tag):
            # Each word element becomes one tab-separated line: word form, token type, lemma, POS.
            line = word.childNodes[0].nodeValue + "\tTOK\t" + word.attributes[lemma_name].value + "\t" + word.attributes[pos_name].value + "\t\n"
            tab_separated_output.append(line)
        line = "\t</S>\t\n"
        tab_separated_output.append(line)
        sentence_id += 1
    tab_separated_output.append(foot)
    return "".join(tab_separated_output).encode("utf8", "ignore")
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
    fname = os.path.basename(path)
    f = safeOpen(path)

    data = []
    head = "<TEXT title="+fname+">\t\n"
    foot = "</TEXT>\t\n"
    data.append(head)

    sentence_counter = 0
    for line in f:
        splitted_line = re.split(separator, line.strip())
        if len(splitted_line) >= 4:
            # Token line: keep only the word, token type, lemma and POS columns.
            new_line = splitted_line[word_index] + "\t" + splitted_line[token_index] + "\t" + splitted_line[lemma_index] + "\t" + splitted_line[pos_index] + "\t\n"
            data.append(new_line)
        else:
            # Shorter lines either open or close a sentence, or are copied through as-is.
            added = False
            for el in splitted_line:
                if re.match(start_tag, el.strip()):
                    data.append("\t<S id=\"0_" + str(sentence_counter) + "\">\t\n")
                    added = True
                    break
                elif re.match(end_tag, el.strip()):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                data.append("\t".join(splitted_line + ["\t\n"]))
    data.append(foot)
    return "".join(data)
def load_tagged_corpus(input_dict):
    """
    Loads a tagged corpus in tab-separated or TEI format (e.g. the output of totrtale).
    """
    f = safeOpen(input_dict['file'])
    #fname = os.path.basename(input_dict['file'])
    #subprocess.call(["java -jar jing.jar tei_imp.rng " + fname + " >" + "out.txt"],shell=True)
    data = ""

    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify numbers in the index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]
        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review the start tag, end tag and separator parameters.")

        if word_index == 0 and token_index == 1 and lemma_index == 2 and pos_index == 3 and start_tag == u'<S>' and end_tag == '</S>':
            # Default column layout and sentence tags: the file is already in the
            # expected tab-separated format, so pass its content through unchanged.
            f = safeOpen(input_dict['file'])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict['file'], word_index=word_index, token_index=token_index, lemma_index=lemma_index, pos_index=pos_index, start_tag=start_tag, end_tag=end_tag, separator=separator)
    else:
        #fname = os.path.basename(input_dict['file'])
        #data = f.read()
        #path = os.path.dirname(os.path.abspath(__file__)) + os.sep
        #subprocess.call(["java -jar " + path+"jing.jar " + path+ "tei_imp.rng <" + data + " >" + "out.txt"],shell=True)
        #f = open("out.txt", "r")
        #error = f.read()
        #if len(error) > 0:
        #    raise Exception(error)
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]
        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review the parameters for the TEI format.")
        data = parse_xml(input_dict['file'], lemma_name=lemma_name, pos_name=pos_name, word_tag=word_tag, sentence_tag=sentence_tag)

    return {'annotations': data}
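# Illustrative usage sketch (not part of this commit): a minimal input_dict for the
# TEI branch of load_tagged_corpus. The file path is hypothetical; the remaining
# keys correspond to the widget inputs defined in the JSON fixture of this commit.
def _load_tagged_corpus_usage_example():
    input_dict = {"file": "corpus.tei.xml",   # hypothetical path
                  "input_format": "tei_format",
                  "lemma_name": "lemma", "pos_name": "ana",
                  "sentence_tag": "s", "word_tag": "w"}
    return load_tagged_corpus(input_dict)["annotations"]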
def totrtale_request(params):
......@@ -545,6 +648,7 @@ def TEItoTab(text, doc_id=0):
    choice_found = False #if lang in ["gaji", "boho"]
    local_s = ""
    for l in text.splitlines():
        print l
        if "<choice>" in l:
            choice_found = True
......
......@@ -11,7 +11,7 @@
"has_progress_bar": false,
"image": "",
"description": "",
"static_image": "",
"static_image": "nlp.png",
"action": "load_tagged_corpus",
"visualization_view": "",
"streaming_visualization_view": "",
......@@ -24,6 +24,244 @@
"name": "Load tagged corpus"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "TEI format settings",
"short_name": "tfs",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tfs",
"parameter": true,
"order": 4,
"uid": "1c044e6d-454f-41fd-85dd-26402d16c306"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma name",
"short_name": "lmn",
"default": "lemma",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_name",
"parameter": true,
"order": 5,
"uid": "23738c56-2b0e-4f49-92f7-ea3805e082e6"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Start tag",
"short_name": "stg",
"default": "<S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "start_tag",
"parameter": true,
"order": 14,
"uid": "35a13690-c7e2-424c-b724-2328b0701d91"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Sentence tag",
"short_name": "sen",
"default": "s",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "sentence_tag",
"parameter": true,
"order": 7,
"uid": "463c90ce-3865-4b7f-8bd1-22c8f37f737a"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS index",
"short_name": "psi",
"default": "4",
"description": "Part of speech index.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_index",
"parameter": true,
"order": 13,
"uid": "5861150d-3e9c-4fcb-b08d-55e8ba3b99ea"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word tag",
"short_name": "wtg",
"default": "w",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_tag",
"parameter": true,
"order": 8,
"uid": "5ed28783-37ed-4790-8cba-7edeac35a1ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "End tag",
"short_name": "etg",
"default": "</S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "end_tag",
"parameter": true,
"order": 15,
"uid": "7198fadf-38a0-4565-9bcc-6d11ccfd891f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Input format",
"short_name": "ifr",
"default": "tei_format",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "input_format",
"parameter": true,
"order": 3,
"uid": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Tab separated format settings",
"short_name": "tss",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tss",
"parameter": true,
"order": 9,
"uid": "81b91f82-535b-4d00-9970-a95a4ee1727f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Separator",
"short_name": "sep",
"default": "\\t",
"description": "Define regex expression",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "separator",
"parameter": true,
"order": 20,
"uid": "83f4d974-0cee-4a2e-bddf-18f97d9d19ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Token index",
"short_name": "tki",
"default": "2",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "token_index",
"parameter": true,
"order": 11,
"uid": "88714bc1-4489-40a7-b601-ada613d1fa99"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS name",
"short_name": "psn",
"default": "ana",
"description": "Part of speech tag",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_name",
"parameter": true,
"order": 6,
"uid": "9f796b5c-6093-494a-abdd-46cfae74a90d"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma index",
"short_name": "lmi",
"default": "3",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_index",
"parameter": true,
"order": 12,
"uid": "a9420c84-1f6d-430a-8370-77a6d2b7559b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word index",
"short_name": "wdi",
"default": "1",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_index",
"parameter": true,
"order": 10,
"uid": "e5371a20-c9bd-44d8-a351-e40ad697fc79"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -52,5 +290,23 @@
"order": 1,
"uid": "009be1e0-8ce5-4a33-be87-c9b38b25b192"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Tab separated format",
"uid": "473ec8d3-15ad-4097-86b3-3f7078fbaa91",
"value": "tab_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "TEI format",
"uid": "ec1260b0-1adc-4ef2-a2aa-1806393e2490",
"value": "tei_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
}
]
\ No newline at end of file
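For illustration only (not part of this commit): the "variable" fields of the abstractinput entries above become the input_dict keys read by load_tagged_corpus, and the two abstractoption values select its branches. A hypothetical tab-format call would look roughly like this:

input_dict = {"file": "corpus.tab",            # hypothetical path
              "input_format": "tab_format",    # abstractoption value "Tab separated format"
              "word_index": "1", "token_index": "2", "lemma_index": "3", "pos_index": "4",
              "start_tag": "<S>", "end_tag": "</S>", "separator": "\\t"}
annotations = load_tagged_corpus(input_dict)["annotations"]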