library.py 21.9 KB
Newer Older
1 2 3
import nlp
import os.path
import base64
4
from services.webservice import WebService
5
from workflows.security import safeOpen
hiphop's avatar
hiphop committed
6
from requests import post
romanorac's avatar
romanorac committed
7 8 9
import json
import re
import itertools
10

hiphop's avatar
hiphop committed
11 12 13
def definition_sentences2(input_dict):
    """Placeholder widget: ignores its input and produces no outputs."""
    return dict()

14 15 16 17 18 19 20
def merge_sentences(input_dict):
    """
    Merges the input sentences in XML according to the specified method.

    Methods: 'union', 'intersection', and 'intersection_two' (a sentence
    is kept as soon as at least two of the input sets agree on it).
    """
    how = input_dict['method']
    selected = set()
    sentence_by_id = {}
    id_sets = []
    for idx, sentences_xml in enumerate(input_dict['sentences']):
        parsed = nlp.parse_def_sentences(sentences_xml)
        current_ids = set(s['id'] for s in parsed)
        id_sets.append(current_ids)
        # Remember each sentence so the result can be serialized at the end.
        for s in parsed:
            sentence_by_id[s['id']] = s
        if idx == 0 and how != 'intersection_two':
            selected = current_ids
        if how == 'union':
            selected = selected | current_ids
        elif how == 'intersection':
            selected = selected & current_ids
        elif how == 'intersection_two':
            # Keep a sentence whenever any *other* input set also has it.
            for other_ids in id_sets[:idx] + id_sets[idx + 1:]:
                selected = selected | (other_ids & current_ids)
    merged = [sentence_by_id[sid] for sid in selected]
    return {'merged_sentences': nlp.sentences_to_xml(merged)}

hiphop's avatar
hiphop committed
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
def merge_sentences2(input_dict):
    """
    Merges the input sentences in XML according to the specified method.

    Methods: 'union', 'intersection', and 'intersection_two' (a sentence
    is kept as soon as at least two of the input sets agree on it).
    """
    how = input_dict['method']
    selected = set()
    sentence_by_id = {}
    id_sets = []
    for idx, sentences_xml in enumerate(input_dict['sentences']):
        parsed = nlp.parse_def_sentences2(sentences_xml)
        current_ids = set(s['id'] for s in parsed)
        id_sets.append(current_ids)
        # Remember each sentence so the result can be serialized at the end.
        for s in parsed:
            sentence_by_id[s['id']] = s
        if idx == 0 and how != 'intersection_two':
            selected = current_ids
        if how == 'union':
            selected = selected | current_ids
        elif how == 'intersection':
            selected = selected & current_ids
        elif how == 'intersection_two':
            # Keep a sentence whenever any *other* input set also has it.
            for other_ids in id_sets[:idx] + id_sets[idx + 1:]:
                selected = selected | (other_ids & current_ids)
    merged = [sentence_by_id[sid] for sid in selected]
    return {'merged_sentences': nlp.sentences_to_xml2(merged)}

72 73 74 75 76 77 78

def load_corpus(input_dict):
    """
    Reads the input file, encodes it in base 64 and submits it to the
    totale web service for parsing; returns the parsed corpus.
    """
    path = input_dict['file']
    encoded = base64.b64encode(safeOpen(path).read())
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    service = WebService(wsdl, 60000)
    result = service.client.parseFile(fileName=os.path.basename(path),
                                      inFile=encoded)
    return {'corpus': result['parsedFile']}
84

romanorac's avatar
romanorac committed
85 86 87 88
def load_corpus2(input_dict):
    """
    Builds a base64-encoded payload from either the widget's text box or
    an uploaded file and sends it to the parseFile web service.
    """
    if input_dict["use_text"] == "true":
        # Text box input instead of a file upload.
        text = input_dict[u"text"].strip()
        if len(text) == 0:
            raise Exception("Please input text or uncheck the Use text checkbox.")
        fname = "input_string.txt"
        data = base64.b64encode(text)
    else:
        path = input_dict['file']
        fname = os.path.basename(path)
        data = base64.b64encode(safeOpen(path).read())

    # Call the parseFile web service.
    webservices_url = "http://vihar.ijs.si:8104"
    resp = post(webservices_url + "/parseFile",
                params={"filename": fname, "text": data})
    content = json.loads(resp.content)[u'parseFileResponse'][u'parseFileResult']
    # NOTE: server-side error propagation is intentionally disabled
    # (it was commented out in the original):
    # if content[u"error"] != "":
    #     raise Exception(content[u"error"])
    return {'corpus': content[u"resp"]}
romanorac's avatar
romanorac committed
116 117 118 119 120 121 122 123 124 125 126

def load_tagged_corpus(input_dict):
    """
    Loads a TEI file (the output of totrtale) and returns its raw contents.
    """
    #fname = os.path.basename(input_dict['file'])
    #subprocess.call(["java -jar jing.jar tei_imp.rng " + fname + " >" + "out.txt"],shell=True)
    contents = safeOpen(input_dict['file']).read()
    return {'annotations': contents}

hiphop's avatar
hiphop committed
127
def totrtale_request(params):
    """POSTs the given parameters to the runToTrTaLe web service."""
    base_url = "http://vihar.ijs.si:8104"
    return post(base_url + "/runToTrTaLe", params=params)

def nlp_totrtale2(input_dict, widget):
romanorac's avatar
romanorac committed
133 134 135
    '''
    Calls the totrtale web service.
    '''
hiphop's avatar
hiphop committed
136 137 138 139
    import multiprocessing
    from xml.dom.minidom import parseString
    import time
    import math
hiphop's avatar
hiphop committed
140
    import copy
romanorac's avatar
romanorac committed
141

hiphop's avatar
hiphop committed
142 143 144
    progress_accumulator = 0
    widget.progress= progress_accumulator
    widget.save()
romanorac's avatar
romanorac committed
145

hiphop's avatar
hiphop committed
146 147
    processes = 4
    DOCUMENTS_SIZE = 3 * int(1e6) #Document size (MB) per process
hiphop's avatar
hiphop committed
148 149
    SINGLE_DOC_SIZE = 1 * int(1e6)
    
hiphop's avatar
hiphop committed
150
    corpus = parseString(input_dict['corpus'])
hiphop's avatar
hiphop committed
151
    
hiphop's avatar
hiphop committed
152 153 154
    language = input_dict['lang'], 
    postprocess = input_dict['postprocess'] == "true"
    xml = input_dict['xml'] == "true"
hiphop's avatar
hiphop committed
155

hiphop's avatar
hiphop committed
156 157 158
    params = {"language": language, 
            "postprocess": postprocess, 
            "xml":xml}
hiphop's avatar
hiphop committed
159
             
hiphop's avatar
hiphop committed
160 161 162 163 164 165 166 167 168 169
    tei_corpus = corpus.getElementsByTagName('teiCorpus')
    if tei_corpus:
        tei_head = '<?xml version="1.0" encoding="utf-8"?>\n' + \
                   '<teiCorpus xmlns="http://www.tei-c.org/ns/1.0">\n'
        tei_header = corpus.getElementsByTagName('teiHeader')[0].toxml() + "\n"
        tei_tail = '</teiCorpus>'

    pool = multiprocessing.Pool(processes=processes)
    documents = corpus.getElementsByTagName('TEI')
    documents_size, document_num, process_num = 0, 0, 1
hiphop's avatar
hiphop committed
170
    #titles = []
hiphop's avatar
hiphop committed
171

hiphop's avatar
hiphop committed
172
    results, docs, single_docs = [], [], []
hiphop's avatar
hiphop committed
173
    for i, document in enumerate(documents):
hiphop's avatar
hiphop committed
174
        doc_len = len(document.getElementsByTagName('body')[0].getElementsByTagName('p')[0].childNodes[0].nodeValue)
hiphop's avatar
hiphop committed
175 176 177
        doc_title = document.getElementsByTagName('title')[0].firstChild.nodeValue
        #titles.append(doc_title)
        print doc_title
hiphop's avatar
hiphop committed
178
        if doc_len > SINGLE_DOC_SIZE:
hiphop's avatar
hiphop committed
179
            
hiphop's avatar
hiphop committed
180
            predhead = '<TEI xmlns="http://www.tei-c.org/ns/1.0">\n'
hiphop's avatar
hiphop committed
181
            title = '<title>' + doc_title + '</title>\n'
hiphop's avatar
hiphop committed
182 183 184 185 186 187
            head = '<text>\n<body>\n<p>\n'
            header = document.getElementsByTagName('teiHeader')[0].toxml() + "\n"
            tail = '\n</p>\n</body>\n</text>\n</TEI>'
            

            document_text = document.getElementsByTagName('body')[0].getElementsByTagName('p')[0].childNodes[0].nodeValue.strip().replace("&","&amp;").replace("<","&lt;").replace(">","&gt;").replace("\"","&quot;")
hiphop's avatar
hiphop committed
188

hiphop's avatar
hiphop committed
189 190 191 192 193
            prev_j, curr_j  = 0, SINGLE_DOC_SIZE
            while (curr_j+2) < len(document_text):
                while (curr_j+2) < len(document_text) and document_text[curr_j:curr_j+2] != ". ":
                    curr_j+=1
                sub_params = copy.deepcopy(params)
hiphop's avatar
hiphop committed
194 195 196 197
                if prev_j == 0:
                    sub_params["text"] = predhead +title + head + document_text[prev_j: curr_j+2] +tail
                else:
                    sub_params["text"] = predhead + head + document_text[prev_j: curr_j+2] + tail
hiphop's avatar
hiphop committed
198
                sub_params["doc_id"] = str(len(results))
hiphop's avatar
hiphop committed
199 200 201 202 203 204 205 206 207 208 209 210 211
                results.append(pool.apply_async(totrtale_request, args=[sub_params]))
                if prev_j == 0:
                    single_docs.append(0)
                else:
                    single_docs.append(1)
                prev_j = curr_j+2
                curr_j += SINGLE_DOC_SIZE
                document_num+=1
                process_num += 1
                
                if curr_j > doc_len:
                    sub_params = copy.deepcopy(params)
                    sub_params["text"] = predhead + head + document_text[prev_j:] + tail
hiphop's avatar
hiphop committed
212
                    sub_params["doc_id"] = str(len(results))
hiphop's avatar
hiphop committed
213
                    results.append(pool.apply_async(totrtale_request, args=[sub_params]))
hiphop's avatar
hiphop committed
214
                    document_num += 1
hiphop's avatar
hiphop committed
215 216
                    process_num += 1
                    single_docs.append(2)
hiphop's avatar
hiphop committed
217
            print "document was split",doc_title, len(single_docs)
hiphop's avatar
hiphop committed
218 219 220 221 222
        else:
            docs.append(document.toxml())
            document_num+=1
            documents_size += doc_len
            
hiphop's avatar
hiphop committed
223
            if documents_size > DOCUMENTS_SIZE or (document_num) % 10==0 or i == len(documents)-1:
hiphop's avatar
hiphop committed
224 225 226 227
                documents_size = 0
                document_num = 0
                sub_params = copy.deepcopy(params)
                sub_params["text"] = "\n".join(docs)
hiphop's avatar
hiphop committed
228
                sub_params["doc_id"] = str(len(results))
hiphop's avatar
hiphop committed
229
                print "whole document was added", len(docs)
hiphop's avatar
hiphop committed
230 231 232 233
                results.append(pool.apply_async(totrtale_request, args=[sub_params]))
                process_num += 1
                docs = []
                single_docs.append(-1)
hiphop's avatar
hiphop committed
234 235
    pool.close()

hiphop's avatar
hiphop committed
236 237


hiphop's avatar
hiphop committed
238 239 240 241 242 243
    response = ["" for i in results]
    progress = [True]
    
    while any(progress):
        time.sleep(1)
        progress = [not result.ready() for result in results]
hiphop's avatar
hiphop committed
244
        print progress
hiphop's avatar
hiphop committed
245 246
        for i, prog in enumerate(progress):
            if not prog and response[i] == "":
hiphop's avatar
hiphop committed
247 248 249 250 251
                try:
                    resp=json.loads(results[i].get().content)[u'runToTrTaLeResponse'][u'runToTrTaLeResult']
                except Exception as e:
                    raise Exception("There was a problem processing your file.")

hiphop's avatar
hiphop committed
252 253 254
                if resp["error"] != "":
                    progress = [False]
                    raise Exception(resp["error"])
hiphop's avatar
hiphop committed
255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272
                if xml:
                    if single_docs[i] == 0:
                        print "remove back", i
                        pos1 = resp["resp"].find("<s>")
                        pos2 = resp["resp"].find("</p>")
                        response[i] = predhead + header + head + resp["resp"][pos1:pos2]    
                    elif single_docs[i] == 2:
                        print "remove front", i
                        pos1 = resp["resp"].find("<s>")
                        response[i] = resp["resp"][pos1:]
                    elif single_docs[i] == 1:
                        print "remove both", i
                        pos1 = resp["resp"].find("<s>")
                        pos2 = resp["resp"].find("</p>")
                        response[i] = resp["resp"][pos1:pos2]
                    else:
                        print "nothing to remove"
                        response[i] = resp["resp"]
hiphop's avatar
hiphop committed
273
                else:
hiphop's avatar
hiphop committed
274 275 276 277 278 279
                    if single_docs[i] in [0,1]:
                        pos2 = resp["resp"].find("</TEXT>")
                        response[i] = resp["resp"][:pos2]    
                    else:
                        print "nothing to remove"
                        response[i] = resp["resp"]
hiphop's avatar
hiphop committed
280

hiphop's avatar
hiphop committed
281
                progress_accumulator += 1/float(len(results))*100
hiphop's avatar
hiphop committed
282
                print progress_accumulator
hiphop's avatar
hiphop committed
283 284
                widget.progress = math.floor(progress_accumulator)

hiphop's avatar
hiphop committed
285 286
                widget.save()
    pool.join()
hiphop's avatar
hiphop committed
287
    
hiphop's avatar
hiphop committed
288 289 290 291
    if not any(progress):
        widget.progress=100
        widget.save()
        response = "".join(response)
romanorac's avatar
romanorac committed
292

hiphop's avatar
hiphop committed
293 294 295
        if tei_corpus and xml:
            response = tei_head + tei_header + response + tei_tail
        return {'annotations': response}
296 297 298 299 300 301 302

def nlp_totrtale(input_dict):
    '''
    Calls the totrtale web service.
    '''
    corpus = input_dict['corpus']
    lang = input_dict['lang']
303 304 305 306 307
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')
    xml = input_dict['xml'] == 'true'
    postprocess = input_dict['postprocess'] == 'true'
    bohoricica = input_dict['bohoricica'] == 'true'
    antique = input_dict['antique'] == 'true'
hiphop's avatar
hiphop committed
308

309 310 311 312 313 314 315 316 317 318 319
    ws = WebService(wsdl, 60000)
    response = ws.client.runTotale(inFile=corpus, language=lang,
                                   postProcessing=postprocess,
                                   bohoricica=bohoricica,
                                   antiqueSlovenian=antique,
                                   outputAsXML=xml)
    errors = response['error']
    if errors:
        # todo report this as warning
        print errors
    return {'annotations': response['annotatedFile']}
320 321 322 323 324 325 326 327 328


def nlp_term_extraction(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    annotations = input_dict['annotations']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8095/totale?wsdl')

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    service = WebService(wsdl, 60000)
    result = service.client.TermExtraction(corpus=annotations,
                                           lang=input_dict['lang'],
                                           threshold=0)
    return {'candidates': result['candidates']}

hiphop's avatar
hiphop committed
338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398
def get_default_stop_word_list(lang):
    """
    Returns the built-in stop word list for the given language code.

    :param lang: two-letter language code ("en" or "sl").
    :return: list of stop words; an empty list for unsupported languages
        (the original implicitly returned None, which crashed callers
        that concatenate the result with another list).
    """
    defaults = {
        "en": ["et al"],
        "sl": ["itd", "slon", "ovira", "zob"],
    }
    return defaults.get(lang, [])

def nlp_term_extraction2(input_dict):
    '''
    Term extraction from totrtale annotations.
    '''
    ws_url = "http://vihar.ijs.si:8081/call"
    annotations = input_dict['annotations']
    lang = input_dict['lang']
    use_default_stop_list = input_dict["stop_list"] == "true"

    user_stop_words = []
    if input_dict['stop_words_file'] != "":
        raw = safeOpen(input_dict['stop_words_file']).read()
        try:
            raw.decode("utf-8")
        except Exception:
            raise Exception("Please make sure that your stop words list is encoded in UTF-8.")
        user_stop_words = raw.split("\n")

    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    if lang == "sl":
        reference_corpus = input_dict["slovene_reference_corpus"]
    elif lang == "en":
        reference_corpus = input_dict["english_reference_corpus"]

    reply = post(ws_url, params={"corpus": annotations,
                                 "lang": lang,
                                 "reference_corpus": reference_corpus})
    resp = json.loads(reply.content)[u'callResponse'][u'callResult']

    default_stop_words = []
    if use_default_stop_list:
        default_stop_words = get_default_stop_word_list(lang)
    stop_list = set(default_stop_words + user_stop_words)

    if len(stop_list) > 0:
        # Drop every candidate line whose term contains a stop word.
        kept = []
        for line in resp.split("\n"):
            drop = False
            if len(line) > 0:
                # The term is the second tab-separated field, minus quotes.
                term = line.split("\t")[1][1:-1]
                drop = any(w.lower() in stop_list for w in term.split(" "))
            if not drop:
                kept.append(line)
        resp = "\n".join(kept)
    return {'candidates': resp}

399

400
def nlp_def_extraction_patterns(input_dict):
    '''
    Definition extraction using pre-defined patterns.
    '''
    annotations = input_dict['annotations']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    service = WebService(wsdl, 60000)
    result = service.client.GlossaryExtractionByPatterns(
        corpus=annotations,
        lang=input_dict['lang'],
        pattern=input_dict['pattern'])
    return {'sentences': result['candidates']}
416

hiphop's avatar
hiphop committed
417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436
def nlp_def_extraction_patterns2(input_dict):
    '''
    Definition extraction using pre-defined patterns.
    '''
    annotations = input_dict['annotations']

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    payload = {"corpus": annotations,
               "pattern": input_dict['pattern'],
               "lang": input_dict['lang']}
    reply = post("http://vihar.ijs.si:8081/patDefSent", params=payload)
    result = json.loads(reply.content)[u'patDefSentResponse'][u'patDefSentResult']
    return {'sentences': result}
437

438
def nlp_def_extraction_terms(input_dict):
    '''
    Definition extraction using terms.
    '''
    annotations = input_dict['annotations']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    service = WebService(wsdl, 60000)
    result = service.client.GlossaryExtractionByTerms(
        corpus=annotations,
        candidates=input_dict['term_candidates'],
        lang=input_dict['lang'],
        nominatives=input_dict['nominatives'],
        termsPerSent=input_dict['terms_per_sentence'],
        select=input_dict['threshold'],
        verb_two_terms=input_dict['verb_two_terms'],
        multiword_term=input_dict['multiword_term'],
        num_multiterms=input_dict['num_multiterms'],
        term_beginning=input_dict['term_beginning'])
    return {'sentences': result['candidates']}
464

hiphop's avatar
hiphop committed
465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498
def nlp_def_extraction_terms2(input_dict):
    '''
    Definition extraction using terms.
    '''
    annotations = input_dict['annotations']

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    payload = {"corpus": annotations,
               "candidates": input_dict['term_candidates'],
               "lang": input_dict['lang'],
               "nominatives": input_dict['nominatives'],
               "terms_per_sentence": input_dict['terms_per_sentence'],
               "select": input_dict['threshold'],
               "verb_two_terms": input_dict['verb_two_terms'],
               "multiword_term": input_dict['multiword_term'],
               "num_multiterms": input_dict['num_multiterms'],
               "term_beginning": input_dict['term_beginning']}
    reply = post("http://vihar.ijs.si:8081/termDefSent", params=payload)
    result = json.loads(reply.content)[u'termDefSentResponse'][u'termDefSentResult']
    return {'sentences': result}

499 500 501 502 503

def nlp_def_extraction_wnet(input_dict):
    '''
    Definition extraction using WordNet.
    '''
    annotations = input_dict['annotations']
    wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8099')

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    service = WebService(wsdl, 60000)
    result = service.client.GlossaryExtractionByWnet(corpus=annotations,
                                                     lang=input_dict['lang'])
    return {'sentences': result['candidates']}
romanorac's avatar
romanorac committed
514

hiphop's avatar
hiphop committed
515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530
def nlp_def_extraction_wnet2(input_dict):
    '''
    Definition extraction using WordNet.
    '''
    annotations = input_dict['annotations']

    # TEI output must be converted to the tabular format first.
    if '<TEI xmlns="http://www.tei-c.org/ns/1.0">' in annotations:
        annotations = TEItoTab(annotations)

    payload = {"corpus": annotations,
               "lang": input_dict['lang']}
    reply = post("http://vihar.ijs.si:8081/wnetDefSent", params=payload)
    result = json.loads(reply.content)[u'wnetDefSentResponse'][u'wnetDefSentResult']
    return {'sentences': result}
romanorac's avatar
romanorac committed
531

hiphop's avatar
hiphop committed
532
def TEItoTab(text, doc_id=0):    
    """
    Converts TEI XML (totrtale output) into the tab-separated token format
    expected by the term- and definition-extraction web services.

    Walks the input line by line: <w> token elements become
    value\tTOK\tlemma\tana rows, <s>/<\\s> become <S id="...">/<\\S>
    sentence markers, <pc> punctuation becomes PUN/PUN_TERM rows, <title>
    opens a <TEXT title=...> block and </body> closes it.

    :param text: TEI document as a string.
    :param doc_id: prefix used when generating sentence ids.
    :return: the converted corpus as a single string.
    """
    # Separators zipped between (value, lemma, ana) to form one token row.
    mask1 = ["\tTOK\t", "\t", "\t\n"]
    # Token element carrying lemma/ana attributes (or marked as unknown).
    pattern1 = "<w (type=\"unknown\")| lemma=\"(?P<lemma>.*?)\" ana=\"(?P<ana>.*?)\">(?P<value>.*?)</w>"
    pattern2 = "<title>(.*?)</title>"
    pattern3 = "<pc>(.*?)</pc>"
    
    # Already-built rows whose TOK tag must be specialized:
    # "Y" marks abbreviations, "Mdo"/"Mdc" mark digits.
    pattern4 = "(.*?)\t(TOK)\t(.*?)\t(Y)"
    pattern5 = "(.*?)\t(TOK)\t(.*?)\t(Mdo|Mdc)"

    # Bare token element without attributes.
    pattern6 = "<w>(.*)</w>"
    newText=[]
    print "TEItoTab started"
    sentence_id = 0
    choice_found=False #if lang in ["gaji", "boho"]
    local_s=""
    for l in text.splitlines():
        
        # A <choice> element supplies an alternative (historical) spelling;
        # capture the next <w> value and prepend it to the following token.
        if "<choice>" in l:
            choice_found=True
            first = True  # NOTE(review): never read afterwards
            continue
        elif choice_found and "<w" in l:
            local_s = re.findall(pattern6, l)[0]
            choice_found=False
            continue

        if "<w" in l:
            match = [m.group("value", "lemma", "ana") for m in re.finditer(pattern1, l)]
            if len(match) == 0:
                # Attribute-less token: buffer it for the next full token row.
                local_s += " " + re.findall(pattern6, l)[0]
            
            elif len(match) == 1:
                match = match[0]
                
            elif len(match) == 2:
                # Two variants on one line: keep the second one.
                match = match[1]
            # Interleave (value, lemma, ana) with the separators into a row.
            l = ''.join(itertools.chain.from_iterable(zip(match, mask1)))
            if len(l) < 100:
                # Specialize the TOK tag for abbreviations and digits.
                value = re.findall(pattern4, l)
                if len(value) > 0:
                    l = "\t".join(value[0]).replace("TOK", "TOK_ABBR") + "\t\n"

                value = re.findall(pattern5, l)
                if len(value) > 0:
                    l = "\t".join(value[0]).replace("TOK", "TOK_DIG") + "\t\n"
            if len(local_s) > 0:
                # Attach the buffered alternative spelling with a "|".
                l = local_s + "|" + l
                local_s = ""
            newText.append(l)
        elif "<s>" in l:
            # Sentence start: emit an <S> marker with a doc-scoped id.
            newText.append("\t\t<S id=\"" + str(doc_id) + "_" + str(sentence_id) + "\">\t\n")
        elif "</s>" in l:
            newText.append("\t\t</S>\t\n")
            sentence_id+=1
        elif "<pc>" in l:
            value = re.findall(pattern3, l)[0]
            if value == ".":
                # Sentence-terminating period.
                newText.append(value+"\t\tPUN_TERM\t\n")
            else:
                # Un-escape XML entities before emitting the punctuation row.
                value = value.replace("&amp;","&").replace("&lt;","<").replace("&gt;", ">").replace("&quot;","\"")
                newText.append(value+"\t\tPUN\t\n")
        elif "<title>" in l:
            title = re.findall(pattern2, l)[0]
            title = title.replace("&amp;","&").replace("&lt;","<").replace("&gt;", ">").replace("&quot;","\"")
            newText.append("<TEXT title=" + title + ">\t\n")
        elif "</body>" in l:
            newText.append("</TEXT>\t\n")
    return "".join(newText)