Commit 4aecc231 authored by vpodpecan

added ClowdFlows implementation of Bio3graph

parent 3d4c5605
from django.shortcuts import render

def bio3graph_filter_integers(request, input_dict, output_dict, widget):
    return render(request, 'interactions/bio3graph_filter_integers.html', {'widget': widget, 'intList': input_dict['intList']})
"""
Bio3graph triplet extractor.
@author: Vid Podpecan <vid.podpecan@ijs.si>
"""
def bio3graph_create_document(input_dict):
    from triplet_extractor import data_structures as ds
    fn = input_dict['docfile']
    doc = ds.Document()
    doc.loadString(open(fn).read())
    return {'document': doc}

def bio3graph_split_sentences(input_dict):
    from triplet_extractor import data_structures as ds
    doc = input_dict['document']
    ds.SentenceSplitter().splitNLTK(doc)
    return {'document': doc}

def bio3graph_parse_sentences(input_dict):
    from triplet_extractor import data_structures as ds
    doc = input_dict['document']
    if not doc.rawSentences:
        raise TypeError('Input document is not split into sentences! Use splitter first.')
    gtc = ds.GeniaTTC()
    gtc.process(doc)
    return {'document': doc}
def bio3graph_build_vocabulary(input_dict):
    from triplet_extractor import tripletExtraction as te
    voc = te.Vocabulary()
    voc.loadCompounds_file(input_dict['compounds'])
    voc.loadPredicates_files(activationFname=input_dict['activation'],
                             activations_rotate=input_dict['activation_rotate'],
                             inhibitionFname=input_dict['inhibition'],
                             bindingFname=input_dict['binding'],
                             activationFname_passive=input_dict['activation_passive'],
                             inhibitionFname_passive=input_dict['inhibition_passive'],
                             bindingFname_passive=input_dict['binding_passive'])
    return {'vocabulary': voc}
def bio3graph_build_default_vocabulary(input_dict):
    from triplet_extractor import tripletExtraction as te
    from os.path import normpath, join, dirname
    dname = normpath(dirname(__file__))
    voc = te.Vocabulary()
    voc.loadCompounds_file(join(dname, 'triplet_extractor/vocabulary/compounds.lst'))
    voc.loadPredicates_files(activationFname=join(dname, 'triplet_extractor/vocabulary/activation.lst'),
                             activations_rotate=join(dname, 'triplet_extractor/vocabulary/activation_rotate.lst'),
                             inhibitionFname=join(dname, 'triplet_extractor/vocabulary/inhibition.lst'),
                             bindingFname=join(dname, 'triplet_extractor/vocabulary/binding.lst'),
                             activationFname_passive=join(dname, 'triplet_extractor/vocabulary/activation_pas.lst'),
                             inhibitionFname_passive=join(dname, 'triplet_extractor/vocabulary/inhibition_pas.lst'),
                             bindingFname_passive=join(dname, 'triplet_extractor/vocabulary/binding_pas.lst'))
    return {'vocabulary': voc}
def bio3graph_extract_triplets(input_dict):
    from triplet_extractor import tripletExtraction as te
    voc = input_dict['vocabulary']
    doc = input_dict['document']
    ex = te.TripletExtractor(voc)
    triplets = ex.extractTripletsNLP(doc, VP_CHECK_POS=1)
    return {'triplets': triplets}

def bio3graph_normalise_triplets(input_dict):
    from triplet_extractor import tripletExtraction as te
    triplets = input_dict['triplets']
    voc = input_dict['vocabulary']
    ex = te.TripletExtractor(voc)
    normalised = ex.normalizeTriplets(triplets)
    return {'normalised_triplets': normalised}

def bio3graph_construct_triplet_network(input_dict):
    from triplet_extractor import tripletExtraction as te
    triplets = input_dict['triplets']
    gk = te.TripletGraphConstructor(triplets)
    graph = gk.export_networkx()
    return {'network_object': graph}

def bio3graph_networkx_to_biomine(input_dict):
    from triplet_extractor import graph_operations as gop
    nwx = input_dict['network']
    bmg = gop.export_to_BMG(nwx)
    return {'biomine_graph': bmg}

def bio3graph_biomine_to_networkx(input_dict):
    from triplet_extractor import graph_operations as gop
    bmg = input_dict['biomine_graph']
    nwx = gop.load_BMG_to_networkx(bmg)
    return {'network_object': nwx}
def bio3graph_biomine_visualizer(input_dict):
    # visualization widget: the graph is only passed through; rendering is done by the BMVis applet template
    return {'biomine_graph': input_dict.get('biomine_graph', None)}
def bio3graph_find_redundant_transitive_relations(input_dict):
    from triplet_extractor import graph_operations as gop
    initialNetwork = input_dict['initial_network']
    newNetwork = input_dict['new_network']
    result = gop.find_transitive_relations(initialNetwork, newNetwork)
    return {'transitive_relations': result}
def bio3graph_remove_relations(input_dict):
    import copy
    # work on a deep copy so the input network stays untouched
    nwx = copy.deepcopy(input_dict['network'])
    relations = input_dict['relations']
    for (fr, to, relType) in relations:
        # edges of the MultiDiGraph are keyed by relation type
        if nwx.has_edge(fr, to, relType):
            nwx.remove_edge(fr, to, relType)
    return {'pruned_graph': nwx}
def bio3graph_incremental_network_merge(input_dict):
    from triplet_extractor import graph_operations as gop
    old = input_dict['existing_network']
    new = input_dict['new_network']
    merged = gop.merge_incremental_graph(old, new)
    return {'merged_network': merged}
def bio3graph_colour_relations(input_dict):
    from triplet_extractor import graph_operations as gop
    import copy
    nwx = copy.deepcopy(input_dict['network'])
    rels = input_dict['relations']
    gop.colour_relations(nwx, rels)
    return {'network': nwx}
def bio3graph_reset_colours(input_dict):
    from triplet_extractor import graph_operations as gop
    import copy
    nwx = copy.deepcopy(input_dict['network'])
    gop.reset_edge_colors(nwx)
    return {'network': nwx}
import os
# === STANDARD PACKAGE SETTINGS ===
PACKAGE_ROOT = os.path.dirname(__file__)
# === AUTO IMPORT OPTIONS ===
# If AUTO_IMPORT_DB is true, the data file(s) listed in AUTO_IMPORT_DB_FILES are imported automatically whenever the ClowdFlows project is newly deployed or refreshed from git
AUTO_IMPORT_DB = False
# For a description of AUTO_IMPORT_DB_REPLACE_OPTION see the 'replace' option of the workflows/import_package command
AUTO_IMPORT_DB_REPLACE_OPTION = True
# If file(s) other than ./db/package_data.json should be imported, adjust AUTO_IMPORT_DB_FILES accordingly
AUTO_IMPORT_DB_FILES = [os.path.join(PACKAGE_ROOT,'db/package_data.json')]
<div id="widgetvisualization-{{widget.pk}}" rel="{{widget.pk}}" class="widgetvisualizationdialog" title="{{widget.name}} visualization" width=700 height=600>
<applet code="biomine.bmvis.BMVis"
archive="{{STATIC_URL}}bio3graph/bmvis.jar"
width="100%" height="100%">
<param name="graph" value="{{ MEDIA_URL }}{{ filename }}">
</applet>
</div>
from triplet_extractor import data_structures as ds
from triplet_extractor import tripletExtraction as te
from os.path import normpath, join, dirname
dname = normpath(dirname(__file__))
doc = ds.Document()
doc.loadString(open(join(dname, 'triplet_extractor/vocabulary/pmc2556844.txt')).read())
ds.SentenceSplitter().splitNLTK(doc)
gtc = ds.GeniaTTC()
gtc.process(doc)
voc = te.Vocabulary()
voc.loadCompounds_file(join(dname, 'triplet_extractor/vocabulary/compounds.lst'))
voc.loadPredicates_files(activationFname=join(dname, 'triplet_extractor/vocabulary/activation.lst'),
                         activations_rotate=join(dname, 'triplet_extractor/vocabulary/activation_rotate.lst'),
                         inhibitionFname=join(dname, 'triplet_extractor/vocabulary/inhibition.lst'),
                         bindingFname=join(dname, 'triplet_extractor/vocabulary/binding.lst'),
                         activationFname_passive=join(dname, 'triplet_extractor/vocabulary/activation_pas.lst'),
                         inhibitionFname_passive=join(dname, 'triplet_extractor/vocabulary/inhibition_pas.lst'),
                         bindingFname_passive=join(dname, 'triplet_extractor/vocabulary/binding_pas.lst'))
ex = te.TripletExtractor(voc)
triplets = ex.extractTripletsNLP(doc, VP_CHECK_POS=1)
print 'Triplets found: ', len(triplets)
from os.path import normpath, join, dirname
import os
import codecs
from unidecode import unidecode
import nltk
from tagger import geniatagger
import pickle
MAX_WORDLEN = 300
PASSIVE_AUX_VERBS = set(['be', 'is', 'are', 'was', 'were', 'has been', 'have been', 'had been', 'get', 'got'])
ACTIVATE = 'activates'
BIND = 'binds'
INHIBIT = 'inhibits'
NP_delete = set(['biosynthesis inhibitor','signalling','signaling', 'homolog',
                 'signal','signals','pathway','pathways','induction','regulation', 'response',
                 'responses','responsive','inhibitor','inhibitors','activator','activators',
                 'producer','producers','plant','plants','mutant','mutants','mutation','mutations',
                 'line','lines'])
VP_delete = set(['may','might','can','could','would'])
sentence_delete = set([x.lower() for x in ['whether','To determine','To investigate','To study',
                                           'one possibility','would be that','It was postulated']])
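# Note: ACTIVATE, BIND and INHIBIT name the three relation types assigned to extracted
# triplets. NP_delete, VP_delete and sentence_delete are presumably filter sets used by
# the extraction code (tripletExtraction.py) to drop generic noun-phrase terms, modal
# verbs and speculative sentences.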
class Document(object):
    def __init__(self, fname=None, docid='', utfdecode=False):
        self.docid = docid
        self.fname = fname
        self.rawText = None
        self.rawSentences = None
        self.tokenizedSentences = None
        self.taggedSentences = None
        self.parsedSentences = None
        if self.fname:
            if not self.docid:
                self.docid = self.fname
            self.loadFile(self.fname, utfdecode=utfdecode)
    #end

    def loadFile(self, fname, utfdecode=False):
        if utfdecode:
            self.rawText = unidecode(codecs.open(fname, 'r', 'utf-8').read())
        else:
            self.rawText = codecs.open(fname, 'r', 'utf-8').read()
        self.fname = fname
        if not self.docid:
            self.docid = self.fname
    #end

    def loadString(self, s):
        self.rawText = s

    def setID(self, s):
        self.docid = s
#end class
class Corpus(list):
    def loadFromDirectory(self, dirname, ftypes=['.txt']):
        fnames = os.listdir(dirname)
        for fname in fnames:
            fullName = os.path.join(dirname, fname)
            if os.path.splitext(fname)[1] in ftypes and os.path.isfile(fullName):
                self.append(Document(fname=fullName))
    #end
#end class
class Triplet(object):
    def __init__(self, subject, predicate, object, sentence, document=None, sentenceNumber=None):
        self.subject = subject
        self.original_subject = subject
        self.predicate = predicate
        self.original_predicate = predicate
        self.object = object
        self.original_object = object
        self.sentence = sentence
        self.__sentenceNumber = sentenceNumber
        self.document = document
        self.passive_aux_verbs = []
        self.passive = None
        self.documentID = ''
        if self.document:
            self.setDocumentID(self.document.docid)
    #end

    def setDocumentID(self, did):
        self.documentID = did

    def __eq__(self, other):
        if self.predicate == other.predicate:
            if self.subject == other.subject and self.object == other.object:
                #if self.subject == other.subject and self.object == other.object or \
                #self.subject == other.object and self.object == other.subject:
                return True
            else:
                return False
        else:
            return False
    #end
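    # Ordering used by __cmp__: by subject, then predicate, then object.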
    def __cmp__(self, other):
        if self.subject < other.subject:
            return -1
        elif self.subject > other.subject:
            return 1
        else:
            if self.predicate < other.predicate:
                return -1
            elif self.predicate > other.predicate:
                return 1
            else:
                if self.object < other.object:
                    return -1
                elif self.object > other.object:
                    return 1
                else:
                    return 0
    #end

    def __hash__(self):
        return hash('%s%s%s' % (self.subject, self.predicate, self.object))

    def __str__(self):
        return '(%s, %s, %s)' % (self.subject, self.predicate, self.object)
        #if self.passive_aux_verbs == []:
            #return '(%s, %s, %s)' % (self.subject, self.predicate, self.object)
        #else:
            #return '(%s, [%s]%s, %s)' % (self.subject, self.passive_aux_verbs[0], self.predicate, self.object)
    #end

    def __repr__(self):
        return self.__str__()

    def printOriginal(self, fp=None):
        if self.passive_aux_verbs == []:
            s = '(%s, %s, %s)' % (self.original_subject, self.original_predicate, self.original_object)
            #print '(%s, %s, %s)' % (self.original_subject, self.original_predicate, self.original_object)
        else:
            s = '(%s, %s%s, %s)' % (self.original_subject, str(self.passive_aux_verbs), self.original_predicate, self.original_object)
            #print '(%s, %s%s, %s)' % (self.original_subject, str(self.passive_aux_verbs), self.original_predicate, self.original_object)
        return s
    #end

    def getParsedSentence(self):
        if self.__sentenceNumber and self.document:
            return self.document.parsedSentences[self.__sentenceNumber]
        else:
            return None
    #end
#end
class SentenceSplitter(object):
    def splitNLTK(self, document):
        assert(isinstance(document, Document))
        ###########
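        # Text cleanup before splitting: protect 'et al.' from being treated as a
        # sentence boundary and normalise a couple of tokens.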
        for ch in [' ', "'", ')', '\n', ',', '.', '[', ':', ';']:
            document.rawText = document.rawText.replace(' et al' + ch, ' ETAL.')
        document.rawText = document.rawText.replace('SA-treatment', 'SA treatment')
        document.rawText = document.rawText.replace('H 2 O 2', 'H2O2')
        ###########
        tokenizer = pickle.load(open(normpath(join(dirname(__file__), 'punkt/english.pickle'))))
        #tokenizer = pickle.load(open(os.path.normpath('punkt/english.pickle')))
        document.rawSentences = [s.replace('\n', '') for s in tokenizer.tokenize(document.rawText)]
    #end

    def splitGENIA(self, document):
        assert(isinstance(document, Document))
        raise NotImplementedError
    #end
#end class
class GeniaTTC(object):
    '''GENIA tokenizer, tagger and chunker'''
    def __init__(self, loadModels=True):
        if loadModels:
            # the GENIA tagger loads its model files relative to the working directory
            cd = os.getcwd()
            os.chdir(normpath(join(dirname(__file__), 'tagger')))
            geniatagger.load_models()
            os.chdir(cd)
    #end
    def process(self, document):
        assert(isinstance(document, Document))
        tokenizedSentences = []
        taggedSentences = []
        parsedSentences = []
        for sentence in document.rawSentences:
            # shorten words longer than MAX_WORDLEN characters
            words = sentence.split()
            words = [w[:MAX_WORDLEN] for w in words]
            sent = ''
            for w in words:
                sent += w + ' '
            #print repr(sentence)
            #print repr(sent)
            #print
            sentence = sent
            tokenized = []
            tokenizedTagged = []
            parsed = []
            if type(sentence) == unicode:
                geniaResult = geniatagger.tag_sentence(sentence.encode('utf-8'))
            else:
                geniaResult = geniatagger.tag_sentence(sentence)
            for wordTags in geniaResult:
                word, base, POStag, chunktag, NEtag = wordTags[0], wordTags[1], wordTags[2], wordTags[3], wordTags[4]
                tokenized.append(word)
                tokenizedTagged.append((word, POStag))
                parsed.append((word, POStag, chunktag))
            #end
            tokenizedSentences.append(tokenized)
            taggedSentences.append(tokenizedTagged)
            #parsedSentences.append(nltk.chunk.util.conlltags2tree(parsed))
            parsedSentences.append(parsed)
        #end
        document.tokenizedSentences = tokenizedSentences
        document.taggedSentences = taggedSentences
        document.parsedSentences = parsedSentences
    #end
    def processSentence(self, sentence):
        tokenized = []
        tokenizedTagged = []
        parsed = []
        geniaResult = geniatagger.tag_sentence(sentence)
        for wordTags in geniaResult:
            word, base, POStag, chunktag, NEtag = wordTags[0], wordTags[1], wordTags[2], wordTags[3], wordTags[4]
            tokenized.append(word)
            tokenizedTagged.append((word, POStag))
            parsed.append((word, POStag, chunktag))
        #end
        #return tokenized, tokenizedTagged, nltk.chunk.util.conlltags2tree(parsed)
        return tokenized, tokenizedTagged, parsed
    #end

    def tokenizeSentence(self, sentence):
        return self.processSentence(sentence)[0]

    def tagSentence(self, sentence):
        return self.processSentence(sentence)[1]

    def parseSentence(self, sentence):
        return self.processSentence(sentence)[2]
#end class
class Vertex(object):
    def __init__(self, name, longName=''):
        self.name = name
        self.longName = longName

    def __hash__(self):
        return hash(self.name)
#end class

class Arc(object):
    def __init__(self, start, end, name, sentence=''):
        assert(isinstance(start, Vertex))
        assert(isinstance(end, Vertex))
        self.start = start
        self.end = end
        self.name = name
        self.sentence = sentence
#end class
if __name__ == "__main__":
#a = Document(fname='/home/vid/programiranje/wingIDE_projects/dragana1/vid/testdoc2.txt')
#SentenceSplitter().splitNLTK(a)
#p = GeniaTTC().parseSentence('The generated ROS activate the production of ethylene, JA and SA.')
#t, tt, p = GeniaTTC().processSentence('The pathogen-inducible genes PR-1 , PR-2 , and PR-5 require SA signaling for activation, whereas the plant defensin gene PDF1.2 , along with a PR-3 and PR-4 gene, are induced by pathogens via an SA-independent and JA-dependent pathway.')
#t, tt, p = GeniaTTC().processSentence('In wild-type Col-0 plants, exogenous application of SA activated PR-1, whereas treatment with methyl jasmonate (MeJA) resulted in the accumulation of LOX2 , VSP , and PDF1.2 mRNA ( Figure 3 ).')
t, tt, p = GeniaTTC().processSentence('Treatment with methyl jasmonate (MeJA) resulted in the accumulation of LOX2 , VSP , and PDF1.2 mRNA ( Figure 3 ).')
a = nltk.chunk.util.conlltags2tree(p)
d = Document()
d.loadString('Treatment with methyl jasmonate (MeJA) resulted in the accumulation of LOX2 , VSP , and PDF1.2 mRNA ( Figure 3 ). A sorghum homolog of the AIM1 gene from Arabidopsis, encoding a fatty acid oxidase ( Richmond and Bleecker, 1999 ), an acyl-CoA synthetase, and an acyl-CoA oxidase, were also confirmed by qRT-PCR to be induced by both SA and MeJA. This is a test.')
ss = SentenceSplitter()
ss.splitNLTK(d)
print d.tokenizedSentences
#A sorghum homolog of the AIM1 gene from Arabidopsis, encoding a fatty acid oxidase ( Richmond and Bleecker, 1999 ), an acyl-CoA synthetase, and an acyl-CoA oxidase, were also confirmed by qRT-PCR to be induced by both SA and MeJA.
import os
import StringIO
import networkx as nx
OPTLINE = '#'
CANVAS = '_canvas'
ATTRS = '_attributes'
DB = '_database'
ARCREL = 'relation'
LINECOLOR = 'linecolor'
BLACK = '0/0/0'
RED = '255/0/0'
GREEN = '0/255/0'
BLUE = '0/0/255'
PINK = '255/0/255'
def load_BMG_to_networkx(data):
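    # 'data' can be either a path to a BMG file or the BMG content itself
    # (decided by the os.path.exists() check below); the result is a networkx MultiDiGraph.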
    fp = StringIO.StringIO()
    if os.path.exists(data):
        fp.write(open(data).read())
    else:
        fp.write(data)
    fp.flush()
    fp.seek(0)
    #assert(os.path.exists(fname))
    #fp = open(fname)
    lines = fp.readlines()
    fp.close()
    #print lines

    graph = nx.MultiDiGraph()
    for line in lines:
        line = line.strip()
        if not line:
            continue
        elts = line.split()
        if elts[0] == OPTLINE:
            if elts[1] == CANVAS:
                continue
            elif elts[1] == ATTRS:
                node = elts[2]
                attrs = elts[3:]
                if not graph.has_node(node):
                    continue