nlp.py 1.12 KB
Newer Older
Janez K's avatar
Janez K committed
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
'''
NLP common functions.

@author: Anze Vavpetic, 2012
'''
import xml.etree.ElementTree as xml

def parse_def_sentences(sentsXML):
    """
    Parses the candidate definition sentences from the input XML string.

    Only 'S' elements that are direct children of the document root are read.
    Each of them is turned into a dict with the keys:
      'id'       - value of the 'sid_sp' attribute
      'aid'      - value of the 'aid_sp' attribute
      'defvalue' - value of the 'defvalue' attribute
      'txt'      - the element's text with surrounding whitespace removed
                   ('' when the element has no text)

    @param sentsXML: the XML document, as a string.
    @return: list of sentence dicts sorted by 'id'. The ids are attribute
             strings, so the ordering is lexicographic ('10' sorts before '2').
    @raise KeyError: if an 'S' element lacks one of the three attributes.
    """
    dom = xml.fromstring(sentsXML)
    sentences = []
    for sent in dom.findall('S'):          # All the tagged sentences
        attrs = sent.attrib
        sentences.append({
            'id': attrs['sid_sp'],
            'aid': attrs['aid_sp'],
            'defvalue': attrs['defvalue'],
            # ElementTree reports the text of an empty element (<S ... />)
            # as None, so fall back to '' before stripping.
            'txt': (sent.text or '').strip(),
        })
    sentences.sort(key=lambda s: s['id'])
    return sentences

def sentences_to_xml(sentences):
    """
    Serializes sentence dicts into a 'definitions' XML document.

    Every item of the input becomes one 'S' child of the root element; its
    'id', 'aid' and 'defvalue' entries are written as the 'sid_sp', 'aid_sp'
    and 'defvalue' attributes, and its 'txt' entry becomes the element text.

    @param sentences: iterable of dicts with the keys 'id', 'aid',
                      'defvalue' and 'txt'.
    @return: the document as produced by ElementTree's tostring() with
             UTF-8 encoding.
    """
    definitions = xml.Element('definitions')
    for entry in sentences:
        attributes = {
            'sid_sp': entry['id'],
            'aid_sp': entry['aid'],
            'defvalue': entry['defvalue'],
        }
        # SubElement creates the node and attaches it to the root in one go.
        node = xml.SubElement(definitions, 'S', attrib=attributes)
        node.text = entry['txt']
    return xml.tostring(definitions, "UTF-8")
    

if __name__ == '__main__':
    # Run test: parse a sample file and print it serialized back to XML.
    import sys

    # An optional first command-line argument overrides the default sample.
    path = sys.argv[1] if len(sys.argv) > 1 else r'D:\programiranje\Glossary\patterns2.txt'
    # Context manager so the file handle is closed once it has been read.
    with open(path) as f:
        pats = f.read()
    sents = parse_def_sentences(pats)
    # Deliberately not named `xml`: at module level that would rebind the
    # ElementTree alias that both functions above rely on.
    serialized = sentences_to_xml(sents)
    # Call form of print is valid on both Python 2 and Python 3.
    print(serialized)