Commit 2b9919a3 authored by Janez K
Browse files

Merge branch 'dev' of source.ijs.si:mothra into dev

parents 36bd4b9a d0a8c968
This diff is collapsed.
......@@ -112,3 +112,15 @@ def stopwatch(input_dict):
output_dict['time_span'] = elapsedTime
return output_dict
def base_safe_eval_string(input_dict):
    """Evaluate the string form of ``input_dict['data']`` as a Python literal.

    Only literal structures are accepted (strings, numbers, tuples, lists,
    dicts, booleans and None) because the text is parsed with
    ``ast.literal_eval`` and never executed.

    Args:
        input_dict: dict with the key 'data'; its value is converted with
            ``str()`` before evaluation.

    Returns:
        dict with the single key 'evaluation_result' holding the evaluated
        object.

    Raises:
        ValueError: if the text is not a valid Python literal.
    """
    import ast
    sdata = str(input_dict['data'])
    try:
        result = ast.literal_eval(sdata)
    except Exception:
        # literal_eval reports bad input with several exception types
        # (ValueError, SyntaxError, ...); fold them into one specific,
        # catchable type instead of a bare Exception.
        raise ValueError('Cannot evaluate string (remember, for safety reasons only literal structures can be evaluated: strings, numbers, tuples, lists, dicts, booleans, and None)')
    return {'evaluation_result': result}
#end
\ No newline at end of file
# NOTE(review): presumably option labels for how missing values are treated;
# neither is referenced in the visible code -- confirm against their users.
IGNORE_MISSING = 'ignore'
IMPUTE_MISSING = 'impute row'
# NOTE(review): looks like the prefix used for Entrez gene identifiers;
# not referenced in the visible code -- confirm.
ENTREZ_GENE_PREFIX = 'Entrez_Gene'
# First character of a column name that marks the control group by default
# when a microarray data header is read.
DEFAULT_CONTROL_GROUP_ID = '1'
# Labels of the two sample groups; used as dictionary keys for per-group data
# and as the two values of the class attribute of the example table.
CONTROL_GROUP_KEY = 'control group'
DATA_GROUP_KEY = 'data group'
# Name of the class attribute of the example table. The identifier is
# misspelled ("ATRR") but is imported elsewhere under exactly this name.
CLASS_ATRR_NAME = 'group'
This diff is collapsed.
......@@ -72,9 +72,11 @@ def segmine_ttest_gene_filter_finished(postdata, input_dict, output_dict):
def segmine_ttest_gene_filter(input_dict):
    """Return a placeholder output for the t-test gene filter.

    ``input_dict`` is not inspected; the result is always a dict whose
    'dataset' entry is None.
    NOTE(review): presumably the real dataset is filled in by
    segmine_ttest_gene_filter_finished -- confirm.
    """
    placeholder = dict(dataset=None)
    return placeholder
def segmine_fc_gene_filter(input_dict):
    """Return a placeholder output for the fold-change gene filter.

    ``input_dict`` is not inspected; the result is always a dict whose
    'dataset' entry is None.
    NOTE(review): presumably the real dataset is produced by a separate
    post-interaction handler -- confirm.
    """
    placeholder = dict(dataset=None)
    return placeholder
def segmine_gene_ranker(input_dict, widget):
import orange
from numpy import mean, var
......@@ -212,3 +214,284 @@ def segmine_biomine_medoid(input_dict):
result, bestPath = search.invokeBiomine()
return {'result' : result, 'bestPath' : bestPath}
def segmine_mirna_to_gene_tarbase(input_dict):
    """Turn ranked miRNAs into ranked genes using the TarBase mapping.

    The rank of every miRNA is added to each gene listed for it in the
    pickled mapping ``data/mirna2gene_tarbase`` stored next to this module,
    so a gene listed for several miRNAs receives the sum of their ranks.

    Args:
        input_dict: dict with the key 'mirna_ranks', an iterable of
            (miRNA name, rank) pairs. Names are lower-cased before lookup.

    Returns:
        dict with the key 'gene_ranks': a list of (str(gene), summed rank)
        pairs sorted by rank, highest first (ties: higher gene id first).
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    from os.path import normpath, join, dirname

    mirna_ranks = input_dict['mirna_ranks']
    mapping_path = normpath(join(dirname(__file__), 'data/mirna2gene_tarbase'))
    # The mapping is a data file shipped with the code; unpickling is only
    # acceptable because the path is fixed and not user-controlled.
    with open(mapping_path, 'rb') as mapping_file:
        mirna2gene = pickle.load(mapping_file)

    gene_scores = {}
    for (rna, rank) in mirna_ranks:
        rna = rna.lower()
        if rna not in mirna2gene:
            # miRNAs without an entry in the mapping are ignored silently
            continue
        for gene in mirna2gene[rna]:
            if gene in gene_scores:
                gene_scores[gene] += rank
            else:
                gene_scores[gene] = rank

    ordered = sorted(gene_scores.items(),
                     key=lambda item: (item[1], item[0]), reverse=True)
    return {'gene_ranks': [(str(gene), score) for (gene, score) in ordered]}
#end
def segmine_mirna_to_gene_targetscan(input_dict):
    """Turn ranked miRNAs into ranked genes using the TargetScan mapping.

    The rank of every miRNA is added to each gene listed for it in the
    pickled mapping ``data/mirna2gene_targetscan`` stored next to this
    module, so a gene listed for several miRNAs receives the sum of their
    ranks.

    Args:
        input_dict: dict with the key 'mirna_ranks', an iterable of
            (miRNA name, rank) pairs. Names are lower-cased before lookup.

    Returns:
        dict with the key 'gene_ranks': a list of (str(gene), summed rank)
        pairs sorted by rank, highest first (ties: higher gene id first).
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle
    from os.path import normpath, join, dirname

    mirna_ranks = input_dict['mirna_ranks']
    mapping_path = normpath(join(dirname(__file__), 'data/mirna2gene_targetscan'))
    # The mapping is a data file shipped with the code; unpickling is only
    # acceptable because the path is fixed and not user-controlled.
    with open(mapping_path, 'rb') as mapping_file:
        mirna2gene = pickle.load(mapping_file)

    gene_scores = {}
    for (rna, rank) in mirna_ranks:
        rna = rna.lower()
        if rna not in mirna2gene:
            # miRNAs without an entry in the mapping are ignored silently
            continue
        for gene in mirna2gene[rna]:
            if gene in gene_scores:
                gene_scores[gene] += rank
            else:
                gene_scores[gene] = rank

    ordered = sorted(gene_scores.items(),
                     key=lambda item: (item[1], item[0]), reverse=True)
    return {'gene_ranks': [(str(gene), score) for (gene, score) in ordered]}
#end
def __makeExampleTable(namesDict, data):
    """Build an Orange example table with one row per sample column.

    Attributes are the gene ids of ``data`` in sorted order (one float
    variable each); the class attribute takes the value of the group the
    sample belongs to. Rows of the control group come first, followed by
    rows of the data group.

    namesDict -- maps each group key to a dict keyed by sample column name
    data      -- data[geneID][group key][column name] is the cell value
    """
    import orange
    from constants import CLASS_ATRR_NAME, CONTROL_GROUP_KEY, DATA_GROUP_KEY

    gene_ids = sorted(data.keys())
    features = [orange.FloatVariable(name=str(gene_id)) for gene_id in gene_ids]
    class_var = orange.EnumVariable(name=CLASS_ATRR_NAME, values = [CONTROL_GROUP_KEY, DATA_GROUP_KEY])
    domain = orange.Domain(features, class_var)
    table = orange.ExampleTable(domain)

    # control samples first, then data samples
    for group_key in (CONTROL_GROUP_KEY, DATA_GROUP_KEY):
        for column in namesDict[group_key].keys():
            row = [data[gene_id][group_key][column] for gene_id in gene_ids]
            row.append(group_key)  # class value goes last
            table.append(orange.Example(domain, row))
    return table
#end
def segmine_read_microarray_data(input_dict):
    """Read a two-group microarray text file and compute per-gene fold changes.

    File layout: fields are separated by whitespace and/or commas. The first
    line is a header whose first field names the gene column; every other
    header field is a sample column name whose first character is its group
    prefix. Exactly two distinct prefixes must occur. The control group is
    the one with prefix DEFAULT_CONTROL_GROUP_ID if present, otherwise the
    group of the first sample column. Every following line holds a gene id
    and one numeric value per sample column. Rows sharing a gene id are
    merged by averaging each column.

    Args:
        input_dict: dict with the keys
            'file' -- path of the data file,
            'idf'  -- input data format: 1 means linear, anything else log2,
            'cm'   -- calculation method: 1 means ratio, anything else
                      difference (of log2 values).

    Returns:
        dict with
            'table'       -- example table built by __makeExampleTable from
                             the averaged (untransformed) values,
            'fold_change' -- list of (gene id, fold change) pairs sorted by
                             fold change, highest first.
        With the ratio method, ratios below 1 are reported as -1/ratio. Genes
        whose group mean is negative are reported on stdout and left out of
        'fold_change'.

    Raises:
        ValueError: empty file, header without exactly two prefixes, a row
            with the wrong number of fields or a non-numeric value, or a
            non-positive linear value that must be log2-transformed.
    """
    from numpy import mean
    import math
    from constants import CONTROL_GROUP_KEY, DATA_GROUP_KEY, DEFAULT_CONTROL_GROUP_ID

    # Read everything up front so the file handle is closed right away.
    with open(input_dict['file']) as infile:
        content = infile.read()
    dataFormat = 'linear' if int(input_dict['idf']) == 1 else 'log2'
    calcMethod = 'ratio' if int(input_dict['cm']) == 1 else 'difference'
    groups = (CONTROL_GROUP_KEY, DATA_GROUP_KEY)

    lines = [x.replace(',', ' ').split() for x in content.splitlines()]
    if not lines:
        raise ValueError('Invalid data file: no header line found')
    names = lines[0][1:]  # skip name of gene column

    # The group of a column is identified by the first character of its name.
    pfs = set(name[0] for name in names)
    if len(pfs) != 2:
        raise ValueError('Invalid data header: exactly two column name prefixes are required, found: %s' % str(sorted(pfs)))
    # If the data do not obey the default rule, the first character of the
    # first column is the identifier of the control group.
    if DEFAULT_CONTROL_GROUP_ID in pfs:
        CONTROL_GROUP_ID = DEFAULT_CONTROL_GROUP_ID
    else:
        CONTROL_GROUP_ID = names[0][0]

    # Map every column name to its position, separately for both groups.
    # With exactly two prefixes, a column that is not control is data.
    # setdefault keeps the first position when a name occurs more than once.
    namesDict = {CONTROL_GROUP_KEY: {}, DATA_GROUP_KEY: {}}
    for index, name in enumerate(names):
        if name.startswith(CONTROL_GROUP_ID):
            namesDict[CONTROL_GROUP_KEY].setdefault(name, index)
        else:
            namesDict[DATA_GROUP_KEY].setdefault(name, index)

    # Parse the rows: data[geneID][group][column] collects the values of all
    # rows that share the gene id.
    data = {}
    ln = 0  # 1-based number of the data row (header not counted)
    for elts in lines[1:]:
        ln += 1
        if len(elts) != len(names) + 1:  # gene id is the first value
            raise ValueError('Wrong number of values, line: %d' % ln)
        geneID = str(elts[0])
        try:
            vals = [float(x) for x in elts[1:]]
        except ValueError:
            raise ValueError('Error while reading values, line: %d' % ln)
        if geneID not in data:
            # init storage
            data[geneID] = {}
            for group in groups:
                data[geneID][group] = dict((atrName, []) for atrName in namesDict[group])
        for group in groups:
            for (name, index) in namesDict[group].items():
                data[geneID][group][name].append(vals[index])

    # Merge duplicates by averaging.
    for geneID in data:
        for group in groups:
            columns = data[geneID][group]
            for (atrName, values) in list(columns.items()):
                columns[atrName] = sum(values) / float(len(values))

    # The table is built from the averaged values before any transformation.
    table = __makeExampleTable(namesDict, data)

    def group_mean(geneID, group):
        # Mean over all sample columns of one group for one gene.
        return mean([data[geneID][group][attrName] for attrName in namesDict[group]])

    logFCs = {}
    if calcMethod == 'ratio':
        if dataFormat == 'log2':
            # log2 data have to be transformed for ratio computation
            for geneID in data:
                for group in groups:
                    for attrName in namesDict[group]:
                        data[geneID][group][attrName] = math.pow(2, data[geneID][group][attrName])
        for geneID in data:
            numerator = group_mean(geneID, DATA_GROUP_KEY)
            denumerator = group_mean(geneID, CONTROL_GROUP_KEY)
            if numerator < 0 or denumerator < 0:
                print('Invalid values, gene %s' % str(geneID))
                continue
            # NOTE(review): a control mean of exactly 0 is not rejected
            # here -- confirm whether such genes should be skipped as well.
            logFCs[geneID] = numerator / denumerator
            # for those less than 1 invert and give negative sign
            if logFCs[geneID] < 1:
                logFCs[geneID] = -1.0 / logFCs[geneID]
    else:
        # difference
        if dataFormat == 'linear':
            # linear data have to be transformed for log2 difference computation
            for geneID in data:
                for group in groups:
                    for attrName in namesDict[group]:
                        value = data[geneID][group][attrName]
                        if value <= 0:
                            raise ValueError('Cannot transform linear data to log2: value is <= 0 for gene %s' % str(geneID))
                        data[geneID][group][attrName] = math.log(value, 2)
        for geneID in data:
            logFCs[geneID] = group_mean(geneID, DATA_GROUP_KEY) - group_mean(geneID, CONTROL_GROUP_KEY)

    # Rank only genes that actually received a fold change; genes skipped
    # above have no entry in logFCs and must not be looked up.
    ranked = sorted(((value, geneID) for (geneID, value) in logFCs.items()), reverse=True)
    sortedLogFCs = [(geneID, value) for (value, geneID) in ranked]
    return {'table': table, 'fold_change': sortedLogFCs}
#end
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment