Commit 34b5f041 authored by hiphop's avatar hiphop

load tagged corpus

parent 6a990d32
import nlp import nlp
import os.path import os
import base64 import base64
from services.webservice import WebService from services.webservice import WebService
from workflows.security import safeOpen from workflows.security import safeOpen
...@@ -7,6 +7,7 @@ from requests import post ...@@ -7,6 +7,7 @@ from requests import post
import json import json
import re import re
import itertools import itertools
import subprocess
def definition_sentences2(input_dict): def definition_sentences2(input_dict):
return {} return {}
...@@ -114,14 +115,116 @@ def load_corpus2(input_dict): ...@@ -114,14 +115,116 @@ def load_corpus2(input_dict):
""" """
return {'corpus': content[u"resp"]} return {'corpus': content[u"resp"]}
def parse_xml(path, lemma_name = "lemma", pos_name = "ana", word_tag = "w", sentence_tag = "s"):
    """
    Convert a tagged XML (TEI-style) corpus file into the tab-separated
    annotation format used downstream: a <TEXT title=...> wrapper, one
    <S id="0_N"> element per sentence, and one token per line as
    word<TAB>TOK<TAB>lemma<TAB>pos.

    path         -- path of the XML file to parse
    lemma_name   -- attribute on word elements holding the lemma
    pos_name     -- attribute on word elements holding the POS tag
    word_tag     -- element name of a single token
    sentence_tag -- element name of a sentence

    Returns the whole converted document as a UTF-8 encoded byte string
    (characters that cannot be encoded are dropped).
    """
    from xml.dom import minidom

    title = os.path.basename(path)
    document = minidom.parse(path)

    parts = ["<TEXT title=" + title + ">\t\n"]
    for idx, sentence in enumerate(document.getElementsByTagName(sentence_tag)):
        parts.append("\t<S id=\"0_" + str(idx) + "\">\t\n")
        for word in sentence.getElementsByTagName(word_tag):
            token = word.childNodes[0].nodeValue
            lemma = word.attributes[lemma_name].value
            pos = word.attributes[pos_name].value
            parts.append(token + "\tTOK\t" + lemma + "\t" + pos + "\t\n")
        parts.append("\t</S>\t\n")
    parts.append("</TEXT>\t\n")
    return "".join(parts).encode("utf8", "ignore")
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
    """
    Normalise a delimiter-separated tagged corpus file into the
    <TEXT>/<S> wrapped tab-separated format used downstream.

    Each input line is split with re.split(separator, ...). Lines with at
    least four fields are emitted as word<TAB>token<TAB>lemma<TAB>pos;
    shorter lines are scanned for a field matching start_tag (opens a new
    sentence) or end_tag (closes it); anything else is passed through
    joined by tabs.

    path        -- path of the corpus file (opened via safeOpen)
    *_index     -- zero-based column positions of the four fields
    start_tag   -- regex matching a sentence-start marker field
    end_tag     -- regex matching a sentence-end marker field
    separator   -- regex used to split each line into fields

    Returns the converted document as one string.
    """
    title = os.path.basename(path)
    source = safeOpen(path)

    parts = ["<TEXT title=" + title + ">\t\n"]
    sentence_no = 0
    for raw in source:
        fields = re.split(separator, raw.strip())
        if len(fields) >= 4:
            # A full annotation row: pick the four configured columns.
            parts.append(fields[word_index] + "\t" + fields[token_index] + "\t"
                         + fields[lemma_index] + "\t" + fields[pos_index] + "\t\n")
            continue
        handled = False
        for field in fields:
            stripped = field.strip()
            if re.match(start_tag, stripped):
                parts.append("\t<S id=\"0_" + str(sentence_no) + "\">\t\n")
                handled = True
                break
            elif re.match(end_tag, stripped):
                parts.append("\t</S>\t\n")
                sentence_no += 1
                handled = True
                break
        if not handled:
            # Neither a sentence marker nor a full row: pass through as-is.
            parts.append("\t".join(fields + ["\t\n"]))
    parts.append(foot_line) if False else parts.append("</TEXT>\t\n")
    return "".join(parts)
def load_tagged_corpus(input_dict):
    """
    Load an already-tagged corpus and return it in the internal
    tab-separated annotation format.

    input_dict["input_format"] selects the parser:
      * "tab_format" -- a delimiter-separated file. Uses the 1-based
        word/token/lemma/pos index parameters plus start_tag, end_tag and
        separator. If the configuration is exactly the default layout
        (indices 1,2,3,4 with <S>/</S> tags) the file is assumed to
        already be in the target format and is read verbatim.
      * anything else (TEI format) -- an XML file, parsed with parse_xml
        using the lemma/pos attribute names and sentence/word tag names.

    Returns {'annotations': data} where data is the converted corpus.
    Raises Exception on non-numeric indices, duplicate indices, or empty
    required parameters.
    """
    data = ""
    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]

        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review start, end tag and separator parameters.")

        # Default configuration: the file is already in the expected
        # format, so skip conversion and read it verbatim.
        if word_index + 1 == 1 and token_index + 1 == 2 and lemma_index + 1 == 3 \
                and pos_index + 1 == 4 and start_tag == u'<S>' and end_tag == '</S>':
            f = safeOpen(input_dict['file'])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict['file'],
                                       word_index=word_index,
                                       token_index=token_index,
                                       lemma_index=lemma_index,
                                       pos_index=pos_index,
                                       start_tag=start_tag,
                                       end_tag=end_tag,
                                       separator=separator)
    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]

        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")
        data = parse_xml(input_dict['file'],
                         lemma_name=lemma_name,
                         pos_name=pos_name,
                         word_tag=word_tag,
                         sentence_tag=sentence_tag)
    return {'annotations': data}
def totrtale_request(params): def totrtale_request(params):
...@@ -545,6 +648,7 @@ def TEItoTab(text, doc_id=0): ...@@ -545,6 +648,7 @@ def TEItoTab(text, doc_id=0):
choice_found=False #if lang in ["gaji", "boho"] choice_found=False #if lang in ["gaji", "boho"]
local_s="" local_s=""
for l in text.splitlines(): for l in text.splitlines():
print l
if "<choice>" in l: if "<choice>" in l:
choice_found=True choice_found=True
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
"has_progress_bar": false, "has_progress_bar": false,
"image": "", "image": "",
"description": "", "description": "",
"static_image": "", "static_image": "nlp.png",
"action": "load_tagged_corpus", "action": "load_tagged_corpus",
"visualization_view": "", "visualization_view": "",
"streaming_visualization_view": "", "streaming_visualization_view": "",
...@@ -24,6 +24,244 @@ ...@@ -24,6 +24,244 @@
"name": "Load tagged corpus" "name": "Load tagged corpus"
} }
}, },
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "TEI format settings",
"short_name": "tfs",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tfs",
"parameter": true,
"order": 4,
"uid": "1c044e6d-454f-41fd-85dd-26402d16c306"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma name",
"short_name": "lmn",
"default": "lemma",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_name",
"parameter": true,
"order": 5,
"uid": "23738c56-2b0e-4f49-92f7-ea3805e082e6"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Start tag",
"short_name": "stg",
"default": "<S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "start_tag",
"parameter": true,
"order": 14,
"uid": "35a13690-c7e2-424c-b724-2328b0701d91"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Sentence tag",
"short_name": "sen",
"default": "s",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "sentence_tag",
"parameter": true,
"order": 7,
"uid": "463c90ce-3865-4b7f-8bd1-22c8f37f737a"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS index",
"short_name": "psi",
"default": "4",
"description": "Part of speech index.",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_index",
"parameter": true,
"order": 13,
"uid": "5861150d-3e9c-4fcb-b08d-55e8ba3b99ea"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word tag",
"short_name": "wtg",
"default": "w",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_tag",
"parameter": true,
"order": 8,
"uid": "5ed28783-37ed-4790-8cba-7edeac35a1ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "End tag",
"short_name": "etg",
"default": "</S>",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "end_tag",
"parameter": true,
"order": 15,
"uid": "7198fadf-38a0-4565-9bcc-6d11ccfd891f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Input format",
"short_name": "ifr",
"default": "tei_format",
"description": "",
"required": true,
"multi": false,
"parameter_type": "select",
"variable": "input_format",
"parameter": true,
"order": 3,
"uid": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Tab separated format settings",
"short_name": "tss",
"default": "",
"description": "",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "tss",
"parameter": true,
"order": 9,
"uid": "81b91f82-535b-4d00-9970-a95a4ee1727f"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Separator",
"short_name": "sep",
"default": "\\t",
"description": "Define regex expression",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "separator",
"parameter": true,
"order": 20,
"uid": "83f4d974-0cee-4a2e-bddf-18f97d9d19ba"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Token index",
"short_name": "tki",
"default": "2",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "token_index",
"parameter": true,
"order": 11,
"uid": "88714bc1-4489-40a7-b601-ada613d1fa99"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "POS name",
"short_name": "psn",
"default": "ana",
"description": "Part of speech tag",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "pos_name",
"parameter": true,
"order": 6,
"uid": "9f796b5c-6093-494a-abdd-46cfae74a90d"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Lemma index",
"short_name": "lmi",
"default": "3",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "lemma_index",
"parameter": true,
"order": 12,
"uid": "a9420c84-1f6d-430a-8370-77a6d2b7559b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "ca48d481-e00f-46d7-b8a6-b27a06a2e24a",
"name": "Word index",
"short_name": "wdi",
"default": "1",
"description": "",
"required": true,
"multi": false,
"parameter_type": "text",
"variable": "word_index",
"parameter": true,
"order": 10,
"uid": "e5371a20-c9bd-44d8-a351-e40ad697fc79"
}
},
{ {
"model": "workflows.abstractinput", "model": "workflows.abstractinput",
"fields": { "fields": {
...@@ -52,5 +290,23 @@ ...@@ -52,5 +290,23 @@
"order": 1, "order": 1,
"uid": "009be1e0-8ce5-4a33-be87-c9b38b25b192" "uid": "009be1e0-8ce5-4a33-be87-c9b38b25b192"
} }
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "Tab separated format",
"uid": "473ec8d3-15ad-4097-86b3-3f7078fbaa91",
"value": "tab_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
},
{
"model": "workflows.abstractoption",
"fields": {
"name": "TEI format",
"uid": "ec1260b0-1adc-4ef2-a2aa-1806393e2490",
"value": "tei_format",
"abstract_input": "7e7ae32a-9512-479b-8c73-7f12b9312183"
}
} }
] ]
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment