# library.py
import nlp
import os
import base64
from services.webservice import WebService
from workflows.security import safeOpen
from requests import post
import json
import re
import itertools
import subprocess

webservices_totrtale_url = "http://172.20.0.154/totrtale"
webservice_def_ex_url = "http://172.20.0.154/definition"

def merge_sentences(input_dict):
    """
    Merges the input sentences in XML according to the specified method.
    """
    method = input_dict['method']
    merged_sen, id_to_sent = set(), {}
    ids_list = []
    for i, sentsXML in enumerate(input_dict['sentences']):
        sents = nlp.parse_def_sentences(sentsXML)
        ids = set(map(lambda x: x['id'], sents))
        ids_list.append(ids)
        # Save the map from id to sentence
        for sent in sents:
            id_to_sent[sent['id']] = sent
        if i == 0 and method != 'intersection_two':
            merged_sen = ids
        if method == 'union':
            merged_sen = merged_sen | ids
        elif method == 'intersection':
            merged_sen = merged_sen & ids
        elif method == 'intersection_two':
            # Skip the current set of sentences
            # and intersect it with the others.
            for ids_alt in ids_list[:i] + ids_list[i+1:]:
                # As long as (at least) two sets agree with a sentence it 
                # will be in the resulting set.
                merged_sen = merged_sen | (ids_alt & ids)
    return {'merged_sentences': nlp.sentences_to_xml([id_to_sent[sid] for sid in merged_sen])}

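# A minimal usage sketch (hypothetical XML inputs a_xml, b_xml, c_xml; assumes
# nlp.parse_def_sentences returns sentence dicts keyed by 'id'). With
# method="intersection_two", a sentence is kept when at least two of the
# extractors agree on it:
#
#   merged = merge_sentences({"method": "intersection_two",
#                             "sentences": [a_xml, b_xml, c_xml]})["merged_sentences"]
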
def merge_sentences2(input_dict):
    """
    Merges the input sentences in XML according to the specified method.
    """
    method = input_dict['method']
    merged_sen, id_to_sent = set(), {}
    ids_list = []
    for i, sentsXML in enumerate(input_dict['sentences']):
        sents = nlp.parse_def_sentences2(sentsXML)
        ids = set(map(lambda x: x['id'], sents))
        ids_list.append(ids)
        # Save the map from id to sentence
        for sent in sents:
            id_to_sent[sent['id']] = sent
        if i == 0 and method != 'intersection_two':
            merged_sen = ids
        if method == 'union':
            merged_sen = merged_sen | ids
        elif method == 'intersection':
            merged_sen = merged_sen & ids
        elif method == 'intersection_two':
            # Skip the current set of sentences
            # and intersect it with the others.
            for ids_alt in ids_list[:i] + ids_list[i+1:]:
                # As long as (at least) two sets agree with a sentence it 
                # will be in the resulting set.
                merged_sen = merged_sen | (ids_alt & ids)
    return {'merged_sentences': nlp.sentences_to_xml2([id_to_sent[sid] for sid in merged_sen])}


def load_corpus(input_dict):
    '''
    Encodes an input file in base64 and sends it to the web service for parsing.
    '''
    f = safeOpen(input_dict['file'])
    fname = os.path.basename(input_dict['file'])
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    data = base64.b64encode(f.read())
    ws = WebService(wsdl, 60000)
    response = ws.client.parseFile(fileName=fname, inFile=data)
    return {'corpus': response['parsedFile']}

def load_corpus2(input_dict):
    '''
    Encodes an input file (or pasted text) in base64 and sends it to the web
    service for parsing.
    '''
    use_text = input_dict["use_text"] == "true"

    if use_text: #checkbox is checked
        fname = "input_string.txt"
        text = input_dict[u"text"].strip()
        if len(text) == 0:
            raise Exception("Please input text or uncheck the Use text checkbox.")
        data = base64.b64encode(text)
    else: #checkbox is not checked
        f = safeOpen(input_dict['file'])
        fname = os.path.basename(input_dict['file'])
        data = base64.b64encode(f.read())
    
    #define web service
    webservice_url = webservices_totrtale_url + "/parseFile"
    params = {"filename": fname, "text": data} #set params
    
    #call web service
    #print webservice_url
    resp = post(webservice_url, data=params)
    #print resp.content
    content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
    """
romanorac's avatar
romanorac committed
113
114
115
    if content[u"error"] != "":
        raise Exception(content[u"error"])
    else:
hiphop's avatar
hiphop committed
116
117
    """
    return {'corpus': content[u"resp"]}
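
# A minimal usage sketch for load_corpus2 (hypothetical values; assumes the
# ToTrTaLe service at webservices_totrtale_url is reachable):
#
#   corpus = load_corpus2({"use_text": "true",
#                          "text": "A sample document.",
#                          "file": ""})["corpus"]
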
def parse_tei(path, lemma_name="lemma", pos_name="ana", word_tag="w", sentence_tag="s"):
    """
    Helper function for load tagged corpus. Function parses TEI format.
    """
    from xml.dom import minidom

    fname = os.path.basename(path)
    xmldoc = minidom.parse(path)
    sentences = xmldoc.getElementsByTagName(sentence_tag)

    tab_separated_output = []
    head = "<TEXT title=" + fname + ">\t\n"
    foot = "</TEXT>\t\n"
    tab_separated_output.append(head)

    sentence_id = 0
    for sentence in sentences:
        line = "\t<S id=\"0_" + str(sentence_id) + "\">\t\n"
        tab_separated_output.append(line)
        for s in sentence.getElementsByTagName(word_tag):
            line = s.childNodes[0].nodeValue + "\tTOK\t" + s.attributes[lemma_name].value + "\t" + s.attributes[pos_name].value + "\t\n"
            tab_separated_output.append(line)
        line = "\t</S>\t\n"
        tab_separated_output.append(line)
        sentence_id += 1
    tab_separated_output.append(foot)
    return "".join(tab_separated_output).encode("utf8", "ignore")

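# Illustration (hypothetical TEI fragment): a token element such as
#   <w lemma="house" ana="NN">houses</w>
# inside an <s> element becomes the tab-separated line
#   houses\tTOK\thouse\tNN
# wrapped between <S id="0_n"> and </S> markers.
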
def parse_tab_separated(path, word_index, token_index, lemma_index, pos_index, start_tag, end_tag, separator):
    """
    Helper function for load tagged corpus. Function parses tab separated format.
    """
    
    fname = os.path.basename(path)
    f = safeOpen(path)

    data = []
    head = "<TEXT title=" + fname + ">\t\n"
    foot = "</TEXT>\t\n"
    data.append(head)

    sentence_counter = 0
    for line in f:
        fields = re.split(separator, line.strip())
        if len(fields) >= 4:
            new_line = fields[word_index] + "\t" + fields[token_index] + "\t" + fields[lemma_index] + "\t" + fields[pos_index] + "\t\n"
            data.append(new_line)
        else:
            added = False
            for el in fields:
                if re.match(start_tag, el.strip()):
                    data.append("\t<S id=\"0_" + str(sentence_counter) + "\">\t\n")
                    added = True
                    break
                elif re.match(end_tag, el.strip()):
                    data.append("\t</S>\t\n")
                    sentence_counter += 1
                    added = True
                    break
            if not added:
                data.append("\t".join(fields + ["\t\n"]))
    data.append(foot)
    return "".join(data)

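# Illustration (hypothetical input): with separator="\t", word_index=0,
# token_index=1, lemma_index=2 and pos_index=3, a line "houses\tTOK\thouse\tNN"
# is copied through, while lines matching start_tag or end_tag (e.g. <S>, </S>)
# become <S id="0_n"> and </S> sentence markers.
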
def load_tagged_corpus(input_dict):
    """
    Loads a tagged corpus in TEI or tab-separated format.
    """
    data = ""
    
    if input_dict["input_format"] == "tab_format":
        try:
            word_index = int(input_dict["word_index"]) - 1
            lemma_index = int(input_dict["lemma_index"]) - 1
            token_index = int(input_dict["token_index"]) - 1
            pos_index = int(input_dict["pos_index"]) - 1
        except ValueError:
            raise Exception("Please specify a number in index fields.")

        start_tag = input_dict["start_tag"]
        end_tag = input_dict["end_tag"]
        separator = input_dict["separator"]

        if len(start_tag) < 1 or len(end_tag) < 1 or len(separator) < 1:
            raise Exception("Please review the start tag, end tag and separator parameters.")
        
        if word_index == 0 and token_index == 1 and lemma_index == 2 and pos_index == 3 and start_tag == u'<S>' and end_tag == '</S>':
            # The file is already in the expected format; read it unchanged.
            f = safeOpen(input_dict['file'])
            data = f.read()
        else:
            if len(set([word_index, lemma_index, token_index, pos_index])) != 4:
                raise Exception("Field indices should be distinct.")
            data = parse_tab_separated(input_dict['file'], word_index=word_index, token_index=token_index, lemma_index=lemma_index, pos_index=pos_index, start_tag=start_tag, end_tag=end_tag, separator=separator)

    else:
        lemma_name = input_dict["lemma_name"]
        pos_name = input_dict["pos_name"]
        sentence_tag = input_dict["sentence_tag"]
        word_tag = input_dict["word_tag"]

        if len(lemma_name) < 1 or len(pos_name) < 1 or len(sentence_tag) < 1 or len(word_tag) < 1:
            raise Exception("Please review parameters for TEI format.")

        data = parse_tei(input_dict['file'], lemma_name=lemma_name, pos_name=pos_name, word_tag=word_tag, sentence_tag=sentence_tag)

    return {'annotations': data}

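# Expected input_dict keys (as read above): "input_format" and "file", plus
# either the tab-format parameters ("word_index", "token_index", "lemma_index",
# "pos_index", "start_tag", "end_tag", "separator") or the TEI parameters
# ("lemma_name", "pos_name", "sentence_tag", "word_tag").
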
def totrtale_request(params):
    webservice_url = webservices_totrtale_url + "/runToTrTaLe"
    return post(webservice_url, data=params)

def nlp_totrtale2(input_dict, widget):
    '''
    Calls the totrtale web service.

    Splits huge documents into smaller pieces and sends them to the ToTrTaLe web service separately; multiple smaller documents are grouped and sent together.
    '''
    import multiprocessing
    from xml.dom.minidom import parseString
    import time
    import math
    import copy

    progress_accumulator = 0  # progress bar state
    widget.progress = progress_accumulator
    widget.save()

    processes = 4  # number of processes for multiprocessing
    DOCUMENTS_SIZE = 3 * int(1e6)  # max combined size (in bytes, ~3 MB) of a group of documents per request
    SINGLE_DOC_SIZE = 1 * int(1e6)  # documents larger than this (in bytes) are split into ~1 MB chunks
    
    corpus = parseString(input_dict['corpus'])
    language = input_dict['lang']
    postprocess = input_dict['postprocess'] == "true"
    xml = input_dict['xml'] == "true"

    params = {"language": language, 
            "postprocess": postprocess, 
            "xml":xml}
    tei_corpus = corpus.getElementsByTagName('teiCorpus')
    if tei_corpus:
        tei_head = '<?xml version="1.0" encoding="utf-8"?>\n' + \
                   '<teiCorpus xmlns="http://www.tei-c.org/ns/1.0">\n'
        tei_header = corpus.getElementsByTagName('teiHeader')[0].toxml() + "\n"
        tei_tail = '</teiCorpus>'

    pool = multiprocessing.Pool(processes=processes)
    documents = corpus.getElementsByTagName('TEI')
    documents_size, document_num, process_num = 0, 0, 1

    results, docs, single_docs = [], [], []
    for i, document in enumerate(documents):
        doc_len = len(document.getElementsByTagName('body')[0].getElementsByTagName('p')[0].childNodes[0].nodeValue)
        doc_title = document.getElementsByTagName('title')[0].firstChild.nodeValue
        print doc_title
        if doc_len > SINGLE_DOC_SIZE:
            # split a single huge document into chunks
            
            predhead = '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
            title = '<title>' + doc_title + '</title>\n'
            head = '<text>\n<body>\n<p>\n'
            header = document.getElementsByTagName('teiHeader')[0].toxml() + "\n"
            tail = '\n</p>\n</body>\n</text>\n</TEI>'
            

            document_text = document.getElementsByTagName('body')[0].getElementsByTagName('p')[0].childNodes[0].nodeValue.strip().replace("&","&amp;").replace("<","&lt;").replace(">","&gt;").replace("\"","&quot;")

            # Cut the document into ~SINGLE_DOC_SIZE chunks, advancing each cut
            # to the next sentence boundary (". ") so no sentence is split.
            prev_j, curr_j = 0, SINGLE_DOC_SIZE
            while (curr_j+2) < len(document_text):
                while (curr_j+2) < len(document_text) and document_text[curr_j:curr_j+2] != ". ":
                    curr_j += 1
                sub_params = copy.deepcopy(params)
                if prev_j == 0:
                    sub_params["text"] = predhead + title + head + document_text[prev_j: curr_j+2] + tail
                else:
                    sub_params["text"] = predhead + head + document_text[prev_j: curr_j+2] + tail
                sub_params["doc_id"] = str(len(results))
                results.append(pool.apply_async(totrtale_request, args=[sub_params]))
                # single_docs flags how each result is trimmed when joined back:
                # 0 = first chunk, 1 = middle chunk, 2 = final chunk of a split
                # document, -1 = group of whole documents
                if prev_j == 0:
                    single_docs.append(0)
                else:
                    single_docs.append(1)
                prev_j = curr_j+2
                curr_j += SINGLE_DOC_SIZE
                document_num += 1
                process_num += 1

                if curr_j > doc_len:
                    sub_params = copy.deepcopy(params)
                    sub_params["text"] = predhead + head + document_text[prev_j:] + tail
                    sub_params["doc_id"] = str(len(results))
                    results.append(pool.apply_async(totrtale_request, args=[sub_params]))
                    document_num += 1
                    process_num += 1
                    single_docs.append(2)
            print "document was split",doc_title, len(single_docs)
        else:
            # group multiple smaller documents into one request
            docs.append(document.toxml())
            document_num += 1
            documents_size += doc_len
            
            if documents_size > DOCUMENTS_SIZE or document_num % 10 == 0 or i == len(documents)-1:
                documents_size = 0
                document_num = 0
                sub_params = copy.deepcopy(params)
                sub_params["text"] = "\n".join(docs)
                sub_params["doc_id"] = str(len(results))
                print "whole document was added", len(docs)
                results.append(pool.apply_async(totrtale_request, args=[sub_params]))
                process_num += 1
                docs = []
                single_docs.append(-1)
    pool.close()

    # Join the ToTrTaLe results back together; also update the progress bar as chunks finish.
    response = ["" for i in results]
    progress = [True]
    while any(progress):
        time.sleep(1)
        progress = [not result.ready() for result in results]
        print progress
        for i, prog in enumerate(progress):
            if not prog and response[i] == "":
                try:
                    resp = json.loads(results[i].get().content)[u'runToTrTaLeResponse'][u'runToTrTaLeResult']
                except Exception:
                    raise Exception("There was a problem processing your file.")

                if resp["error"] != "":
                    progress = [False]
                    raise Exception(resp["error"])
                if xml:
                    # results are in XML
                    if single_docs[i] == 0:
                        print "remove back", i
                        pos1 = resp["resp"].find("<s>")
                        pos2 = resp["resp"].find("</p>")
                        response[i] = predhead + header + head + resp["resp"][pos1:pos2]    
                    elif single_docs[i] == 2:
                        print "remove front", i
                        pos1 = resp["resp"].find("<s>")
                        response[i] = resp["resp"][pos1:]
                    elif single_docs[i] == 1:
                        print "remove both", i
                        pos1 = resp["resp"].find("<s>")
                        pos2 = resp["resp"].find("</p>")
                        response[i] = resp["resp"][pos1:pos2]
                    else:
                        print "nothing to remove"
                        response[i] = resp["resp"]
                else:
                    # results are tab-separated
                    if single_docs[i] in [0,1]:
                        pos2 = resp["resp"].find("</TEXT>")
                        response[i] = resp["resp"][:pos2]    
                    else:
                        print "nothing to remove"
                        response[i] = resp["resp"]

                progress_accumulator += 1/float(len(results))*100
                print progress_accumulator
                widget.progress = math.floor(progress_accumulator)

                widget.save()
    pool.join()
    
    # Return the output only once all processes have completed.
    if not any(progress):
        widget.progress = 100
        widget.save()
        response = "".join(response)

        if tei_corpus and xml:
            response = tei_head + tei_header + response + tei_tail
        return {'annotations': response}
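
# Rough call sketch (hypothetical values; "widget" is the workflow widget whose
# progress bar gets updated, and "corpus" holds a TEI document such as the
# output of load_corpus2):
#
#   out = nlp_totrtale2({"corpus": corpus, "lang": "sl",
#                        "postprocess": "true", "xml": "true"}, widget)
#   annotations = out["annotations"]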

def nlp_totrtale(input_dict):
    '''
    Calls the totrtale web service.
    '''
    corpus = input_dict['corpus']
    lang = input_dict['lang']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    xml = input_dict['xml'] == 'true'
    postprocess = input_dict['postprocess'] == 'true'
    bohoricica = input_dict['bohoricica'] == 'true'
    antique = input_dict['antique'] == 'true'

    ws = WebService(wsdl, 60000)
    response = ws.client.runTotale(inFile=corpus, language=lang,
                                   postProcessing=postprocess,
                                   bohoricica=bohoricica,
                                   antiqueSlovenian=antique,
                                   outputAsXML=xml)
    errors = response['error']
    if errors:
        print errors
    return {'annotations': response['annotatedFile']}


def nlp_term_extraction(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws = WebService(wsdl, 60000)
    response = ws.client.TermExtraction(corpus=annotations, lang=lang,
                                        threshold=0)
    return {'candidates': response['candidates']}

def get_default_stop_word_list(lang):
    if lang == "en":
        return ["et al", "example", "use", "source", "method", "approach", "table", "figure", "percentage"]
    elif lang == "sl":
        return ["itd", "primer", "uporaba", "vir", "metoda", "pristop", "tabela", "slika", "odstotek"]

def nlp_term_extraction2(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    ws_url = webservice_def_ex_url + "/call"
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    stop_list_checkbox = input_dict["stop_list"] == "true"
    user_stop_words = []

    if input_dict['stop_words_file'] != "":
        user_stop_words = safeOpen(input_dict['stop_words_file']).read()
        try:
            user_stop_words.decode("utf-8")
        except Exception:
            raise Exception("Please make sure that your stop words list is encoded in UTF-8.")
        user_stop_words = [word.strip() for word in user_stop_words.split("\n")]

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)
    
    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]
    
    params = {"corpus":annotations,
              "lang": lang,
              "reference_corpus":reference_corpus}
    response = post(ws_url, data=params)
    resp = json.loads(response.content)[u'callResponse'][u'callResult']

    stop_list = []
    if stop_list_checkbox:
        stop_list = get_default_stop_word_list(lang)
    stop_list = set(stop_list + user_stop_words)

    if len(stop_list) > 0:
        resp = resp.split("\n")
        i = 0
        while i < len(resp):
            increase = True
            line = resp[i]
            if len(line) > 0:
                # The second tab-separated field holds the term in quotes; strip them.
                term = line.split("\t")[1][1:-1]
                for word in term.split(" "):
                    if word.lower() in stop_list:
                        increase = False
                        resp.pop(i)
                        break
            if increase:
                i += 1
        resp = "\n".join(resp)
    return {'candidates': resp}
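
# Rough call sketch (hypothetical values, including the "bnc" corpus name;
# assumes the definition-extraction service behind webservice_def_ex_url is
# reachable):
#
#   out = nlp_term_extraction2({"annotations": annotations, "lang": "en",
#                               "stop_list": "true", "stop_words_file": "",
#                               "english_reference_corpus": "bnc"})
#   candidates = out["candidates"]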

def nlp_def_extraction_patterns(input_dict):
    '''
    Definition extraction using pre-defined patterns.
    '''
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws = WebService(wsdl, 60000)
    pattern = input_dict['pattern']
    response = ws.client.GlossaryExtractionByPatterns(corpus=annotations,
                                                      lang=lang, pattern=pattern)
    return {'sentences': response['candidates']}

def nlp_def_extraction_patterns2(input_dict):
    '''
    Definition extraction using pre-defined patterns.
    '''
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    pattern = input_dict['pattern']

    if lang == "sl" and pattern == "begin_allvar":
        raise Exception("Pattern begin_allvar is not supported for slovene language.")


    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws_url = webservice_def_ex_url + "/patDefSent"
    params = {"corpus":annotations,
              "pattern":pattern,
              "lang":lang}
    
    response = post(ws_url, data=params)
    response = json.loads(response.content)[u'patDefSentResponse'][u'patDefSentResult']
    
    return {'sentences': response}

def nlp_def_extraction_terms(input_dict):
    '''
    Definition extraction using terms.
    '''
    annotations = input_dict['annotations']
    term_candidates = input_dict['term_candidates']
    lang = input_dict['lang']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')
    terms_per_sentence = input_dict['terms_per_sentence']
    nominatives = input_dict['nominatives']
    threshold = input_dict['threshold']
    verb_two_terms = input_dict['verb_two_terms']
    multiword_term = input_dict['multiword_term']
    num_multiterms = input_dict['num_multiterms']
    term_beginning = input_dict['term_beginning']

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)
    
    ws = WebService(wsdl, 60000)
    response = ws.client.GlossaryExtractionByTerms(corpus=annotations,
        candidates=term_candidates, lang=lang, nominatives=nominatives,
        termsPerSent=terms_per_sentence, select=threshold, 
        verb_two_terms=verb_two_terms, multiword_term=multiword_term,
        num_multiterms=num_multiterms, term_beginning=term_beginning)
    return {'sentences': response['candidates']}

def nlp_def_extraction_terms2(input_dict):
    '''
    Definition extraction using terms.
    '''
    annotations = input_dict['annotations']
    term_candidates = input_dict['term_candidates']
    lang = input_dict['lang']
    terms_per_sentence = input_dict['terms_per_sentence']
    nominatives = input_dict['nominatives']
    threshold = input_dict['threshold']
    verb_two_terms = input_dict['verb_two_terms']
    multiword_term = input_dict['multiword_term']
    num_multiterms = input_dict['num_multiterms']
    term_beginning = input_dict['term_beginning']

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws_url = webservice_def_ex_url + "/termDefSent"
    params = {"corpus":annotations,
              "candidates":term_candidates,
              "lang":lang,
              "nominatives":nominatives,
              "terms_per_sentence":terms_per_sentence,
              "select": threshold,
              "verb_two_terms":verb_two_terms,
              "multiword_term":multiword_term,
              "num_multiterms":num_multiterms,
              "term_beginning":term_beginning}
    response = post(ws_url, data=params)
    response = json.loads(response.content)[u'termDefSentResponse'][u'termDefSentResult']

    return {'sentences': response}


def nlp_def_extraction_wnet(input_dict):
    '''
    Definition extraction using WordNet.
    '''
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')
    
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws = WebService(wsdl, 60000)
    response = ws.client.GlossaryExtractionByWnet(corpus=annotations, lang=lang)
    return {'sentences': response['candidates']}

def nlp_def_extraction_wnet2(input_dict):
    '''
    Definition extraction using WordNet.
    '''
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    ws_url = webservice_def_ex_url + "/wnetDefSent"
    params = {"corpus":annotations,
              "lang":lang}
    response = post(ws_url, data=params)
    response = json.loads(response.content)[u'wnetDefSentResponse'][u'wnetDefSentResult']
    return {'sentences': response}

def TEItoTab(text, doc_id=0):    
    mask1 = ["\tTOK\t", "\t", "\t\n"]
    pattern1 = "<w (type=\"unknown\")| lemma=\"(?P<lemma>.*?)\" ana=\"(?P<ana>.*?)\">(?P<value>.*?)</w>"
    pattern2 = "<title>(.*?)</title>"
    pattern3 = "<pc>(.*?)</pc>"
    
    pattern4 = "(.*?)\t(TOK)\t(.*?)\t(Y)"
    pattern5 = "(.*?)\t(TOK)\t(.*?)\t(Mdo|Mdc)"

    pattern6 = "<w>(.*)</w>"
    newText = []
    print "TEItoTab started"
    sentence_id = 0
    choice_found=False #if lang in ["gaji", "boho"]
    local_s=""
    for l in text.splitlines():
        print l
        
        if "<choice>" in l:
            choice_found=True
            first = True
            continue
        elif choice_found and "<w" in l:
            local_s = re.findall(pattern6, l)[0]
            choice_found=False
            continue

        if "<w" in l:
            match = [m.group("value", "lemma", "ana") for m in re.finditer(pattern1, l)]
            if len(match) == 0:
                local_s += " " + re.findall(pattern6, l)[0]
            elif len(match) == 1:
                match = match[0]
            elif len(match) == 2:
                match = match[1]
            # interleave (value, lemma, ana) with mask1: "value\tTOK\tlemma\tana\t\n"
            l = ''.join(itertools.chain.from_iterable(zip(match, mask1)))
            if len(l) < 100:
                value = re.findall(pattern4, l)
                if len(value) > 0:
                    l = "\t".join(value[0]).replace("TOK", "TOK_ABBR") + "\t\n"

                value = re.findall(pattern5, l)
                if len(value) > 0:
                    l = "\t".join(value[0]).replace("TOK", "TOK_DIG") + "\t\n"
            if len(local_s) > 0:
                l = local_s + "|" + l
                local_s = ""
            newText.append(l)
        elif "<s>" in l:
            newText.append("\t\t<S id=\"" + str(doc_id) + "_" + str(sentence_id) + "\">\t\n")
        elif "</s>" in l:
            newText.append("\t\t</S>\t\n")
            sentence_id += 1
        elif "<pc>" in l:
            value = re.findall(pattern3, l)[0]
            if value == ".":
                newText.append(value+"\t\tPUN_TERM\t\n")
            else:
                value = value.replace("&amp;","&").replace("&lt;","<").replace("&gt;", ">").replace("&quot;","\"")
                newText.append(value+"\t\tPUN\t\n")
        elif "<title>" in l:
            title = re.findall(pattern2, l)[0]
            title = title.replace("&amp;","&").replace("&lt;","<").replace("&gt;", ">").replace("&quot;","\"")
            newText.append("<TEXT title=" + title + ">\t\n")
        elif "</body>" in l:
            newText.append("</TEXT>\t\n")
    return "".join(newText)

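# Illustration (hypothetical TEI input line): inside an <s> element,
#   <w lemma="hisa" ana="Ncfsn">hisa</w>
# becomes the tab-separated line "hisa\tTOK\thisa\tNcfsn", while <pc>.</pc>
# becomes a PUN_TERM line and any other <pc> content a PUN line.
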
def definition_sentences2(input_dict):
    return {}