Commit 796ebe50 authored by romanorac's avatar romanorac

new totrtale

parent 8146c53d
......@@ -3,7 +3,12 @@ import os.path
import base64
from services.webservice import WebService
from workflows.security import safeOpen
import requests
import json
import re
import itertools
webservices_url = "http://vihar.ijs.si:8104"
def merge_sentences(input_dict):
"""
......@@ -47,6 +52,70 @@ def load_corpus(input_dict):
response = ws.client.parseFile(fileName=fname, inFile=data)
return {'corpus': response['parsedFile']}
def load_corpus2(input_dict):
'''
Parses an input file and encodes it in base 64.
'''
if input_dict[u"text"] == "":
f = safeOpen(input_dict['file'])
fname = os.path.basename(input_dict['file'])
data = base64.b64encode(f.read())
else:
fname = "input_string.txt"
data = base64.b64encode(input_dict[u"text"].strip())
#define web service
webservice_url = webservices_url + "/parseFile"
params = {"filename": fname, "text": data} #set params
#call web service
resp = requests.post(webservice_url, params=params)
content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
if content[u"error"] != "":
raise Exception(content[u"error"])
else:
return {'corpus': content[u"resp"]}
def load_tagged_corpus(input_dict):
"""
Loads TEI file, which is output of totrtale
"""
f = safeOpen(input_dict['file'])
#fname = os.path.basename(input_dict['file'])
#subprocess.call(["java -jar jing.jar tei_imp.rng " + fname + " >" + "out.txt"],shell=True)
data = f.read()
return {'annotations': data}
def nlp_totrtale2(input_dict):
'''
Calls the totrtale web service.
'''
corpus = input_dict['corpus']
lang = input_dict['lang']
xml = input_dict['xml']
postprocess = input_dict['postprocess']
bohoricica = input_dict['bohoricica']
antique = input_dict['antique']
#define web service
webservice_url = webservices_url + "/runToTrTaLe"
params = {"text":corpus, "language": lang, "postProcessing":postprocess, "bohoricica":bohoricica, "antique": antique, "outputAsXML":xml}
import time
start = time.time()
response = requests.post(webservice_url, params = params)
content = json.loads(response.content)
if u'runToTrTaLeResponse' in content:
content = content[u'runToTrTaLeResponse'][u'runToTrTaLeResult']
else:
content = content[u"error"]
end = time.time()
print "ToTrTale execution time was ", end - start
return {'annotations': content[u'resp']}
def nlp_totrtale(input_dict):
'''
......@@ -80,6 +149,10 @@ def nlp_term_extraction(input_dict):
annotations = input_dict['annotations']
lang = input_dict['lang']
wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
annotations = XMLtoTEI(annotations)
ws = WebService(wsdl, 60000)
response = ws.client.TermExtraction(corpus=annotations, lang=lang,
threshold=0)
......@@ -93,6 +166,10 @@ def nlp_def_extraction_patterns(input_dict):
annotations = input_dict['annotations']
lang = input_dict['lang']
wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')
if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
annotations = XMLtoTEI(annotations)
ws = WebService(wsdl, 60000)
pattern = input_dict['pattern']
response = ws.client.GlossaryExtractionByPatterns(corpus=annotations,
......@@ -115,6 +192,10 @@ def nlp_def_extraction_terms(input_dict):
multiword_term = input_dict['multiword_term']
num_multiterms = input_dict['num_multiterms']
term_beginning = input_dict['term_beginning']
if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
annotations = XMLtoTEI(annotations)
ws = WebService(wsdl, 60000)
response = ws.client.GlossaryExtractionByTerms(corpus=annotations,
candidates=term_candidates, lang=lang, nominatives=nominatives,
......@@ -131,6 +212,36 @@ def nlp_def_extraction_wnet(input_dict):
annotations = input_dict['annotations']
lang = input_dict['lang']
wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')
if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
annotations = XMLtoTEI(annotations)
ws = WebService(wsdl, 60000)
response = ws.client.GlossaryExtractionByWnet(corpus=annotations, lang=lang)
return {'sentences': response['candidates']}
def XMLtoTEI(text):
mask1 = ["\tTOK\t", "\t", "\t\n"]
pattern1 = "<w lemma=\"(?P<lemma>.*?)\" ana=\"(?P<ana>.*?)\">(?P<value>.*?)</w>"
pattern2 = "<title>(.*?)</title>"
pattern3 = "<pc>(.*?)</pc>"
newText=[]
for l in text.splitlines():
if "<w" in l:
match = [m.group("value", "lemma", "ana") for m in re.finditer(pattern1, l)][0]
newText.append(''.join(itertools.chain.from_iterable(zip(match, mask1))).decode("utf8"))
elif "</s>" in l:
newText.append("\t\t<S/>\t\n")
elif "<pc>" in l:
value = re.findall(pattern3, l)[0]
if value == ".":
newText.append(value+"\t\tPUN_TERM\t\n")
else:
newText.append(value+"\t\tPUN\t\n")
elif "<title>" in l:
title = re.findall(pattern2, l)[0]
newText.append("<TEXT title=" + title + ">\t\n")
elif "</body>" in l:
newText.append("</TEXT>\t\n")
return "".join(newText)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment