Skip to content
GitLab
Projects
Groups
Snippets
Help
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
C
clowdflows
Project overview
Project overview
Details
Activity
Releases
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Issues
0
Issues
0
List
Boards
Labels
Service Desk
Milestones
Merge Requests
0
Merge Requests
0
Operations
Operations
Incidents
Analytics
Analytics
Repository
Value Stream
Wiki
Wiki
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Create a new issue
Commits
Issue Boards
Open sidebar
Alain Shakour
clowdflows
Commits
7b924aa6
Commit
7b924aa6
authored
Jun 17, 2013
by
vpodpecan
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
- new segmine widgets added
- safe string evaluation widget added
parent
0c7d85bd
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
310 additions
and
6 deletions
+310
-6
workflows/base/library.py
workflows/base/library.py
+13
-1
workflows/segmine/constants.py
workflows/segmine/constants.py
+9
-0
workflows/segmine/library.py
workflows/segmine/library.py
+288
-5
No files found.
workflows/base/library.py
View file @
7b924aa6
...
...
@@ -111,4 +111,16 @@ def stopwatch(input_dict):
output_dict
[
'time_out'
]
=
now
output_dict
[
'time_span'
]
=
elapsedTime
return
output_dict
\ No newline at end of file
return
output_dict
def base_safe_eval_string(input_dict):
    """Evaluate the string form of ``input_dict['data']`` as a Python literal.

    The value is converted with ``str`` and handed to ``ast.literal_eval``,
    so only literal structures are accepted.

    Returns a dict with the single key ``'evaluation_result'``.
    Raises a plain ``Exception`` with an explanatory message when the
    evaluation fails for any reason.
    """
    from ast import literal_eval

    text = str(input_dict['data'])
    try:
        return {'evaluation_result': literal_eval(text)}
    except Exception:
        raise Exception('Cannot evaluate string (remember, for safety reasons only literal structures can be evaluated: strings, numbers, tuples, lists, dicts, booleans, and None)')
#end
\ No newline at end of file
workflows/segmine/constants.py
0 → 100644
View file @
7b924aa6
# Option values for missing-value handling (presumably; their consumers are
# not visible here).
IGNORE_MISSING = 'ignore'
IMPUTE_MISSING = 'impute row'

ENTREZ_GENE_PREFIX = 'Entrez_Gene'

# Column-name prefix that identifies the control group by default.
DEFAULT_CONTROL_GROUP_ID = '1'

# Keys / class values naming the two sample groups.
CONTROL_GROUP_KEY = 'control group'
DATA_GROUP_KEY = 'data group'

# Name of the class attribute.  NOTE: the identifier is spelled "ATRR";
# it is imported under this exact name, so it must not be renamed.
CLASS_ATRR_NAME = 'group'
workflows/segmine/library.py
View file @
7b924aa6
...
...
@@ -47,7 +47,7 @@ def segmine_fc_gene_filter_finished(postdata, input_dict, output_dict):
targets
=
map
(
str
,
postdata
.
get
(
'target%s'
%
widget_id
))
ranker
=
rankers
.
ExpressionSignificance_FoldChange
(
dataset
,
False
)
ranks
=
ranker
(
target
=
targets
if
len
(
targets
)
>
1
else
targets
[
0
])
new_domain
=
orange
.
Domain
([
att
for
att
,
fc
in
ranks
if
fc
>=
fc_threshold
],
new_domain
=
orange
.
Domain
([
att
for
att
,
fc
in
ranks
if
fc
>=
fc_threshold
],
dataset
.
domain
.
classVar
)
reduced_dataset
=
orange
.
ExampleTable
(
new_domain
,
dataset
)
return
{
'dataset'
:
reduced_dataset
}
...
...
@@ -72,16 +72,18 @@ def segmine_ttest_gene_filter_finished(postdata, input_dict, output_dict):
def segmine_ttest_gene_filter(input_dict):
    """Return a placeholder output whose ``'dataset'`` entry is ``None``.

    ``input_dict`` is not inspected.
    """
    return dict(dataset=None)
def segmine_fc_gene_filter(input_dict):
    """Return a placeholder output whose ``'dataset'`` entry is ``None``.

    ``input_dict`` is not inspected.
    """
    return dict(dataset=None)
def
segmine_gene_ranker
(
input_dict
,
widget
):
import
orange
from
numpy
import
mean
,
var
from
math
import
sqrt
,
floor
CONTROL_GROUP_KEY
=
'control group'
DATA_GROUP_KEY
=
'data group'
CLASS_ATRR_NAME
=
'group'
CLASS_ATRR_NAME
=
'group'
table
=
input_dict
[
'microarrayTable'
]
k
=
int
(
input_dict
[
'k'
])
m
=
int
(
input_dict
[
'm'
])
...
...
@@ -171,7 +173,7 @@ def segmine_biomine_neighbourhood(input_dict):
startNodes
=
input_dict
.
get
(
'startNodes'
,
None
)
databaseVersion
=
input_dict
.
get
(
'databaseVersion'
)
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
singleComponent
=
singleComponent
,
maxNodes
=
maxNodes
,
startNodes
=
startNodes
,
...
...
@@ -187,7 +189,7 @@ def segmine_biomine_connection(input_dict):
endNodes
=
input_dict
.
get
(
'endNodes'
,
None
)
databaseVersion
=
input_dict
.
get
(
'databaseVersion'
)
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
singleComponent
=
singleComponent
,
maxNodes
=
maxNodes
,
startNodes
=
startNodes
,
...
...
@@ -203,7 +205,7 @@ def segmine_biomine_medoid(input_dict):
startNodes
=
input_dict
.
get
(
'startNodes'
,
None
)
databaseVersion
=
input_dict
.
get
(
'databaseVersion'
)
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
search
=
BiomineSearch
(
groupNodes
=
groupNodes
,
singleComponent
=
singleComponent
,
maxNodes
=
maxNodes
,
medoids
=
True
,
...
...
@@ -212,3 +214,284 @@ def segmine_biomine_medoid(input_dict):
result
,
bestPath
=
search
.
invokeBiomine
()
return
{
'result'
:
result
,
'bestPath'
:
bestPath
}
def _segmine_mirna_ranks_to_gene_ranks(mirna_ranks, mapping_relpath):
    """Turn ranked miRNAs into ranked genes using a pickled miRNA->genes map.

    mirna_ranks     -- iterable of ``(mirna_name, rank)`` pairs
    mapping_relpath -- path of the pickled mapping, relative to this module

    Every gene accumulates the ranks of all miRNAs mapped to it.  miRNA
    names are lower-cased before lookup; names absent from the mapping are
    silently ignored.

    Returns ``{'gene_ranks': [(gene_as_str, summed_rank), ...]}`` ordered by
    descending ``(summed_rank, gene)``.
    """
    try:
        import cPickle as pickle  # Python 2
    except ImportError:
        import pickle  # Python 3
    from os.path import normpath, join, dirname

    mapping_path = normpath(join(dirname(__file__), mapping_relpath))
    # NOTE(review): unpickling executes arbitrary code; this is only safe as
    # long as the mapping files are trusted files shipped with the package.
    # The context manager closes the file handle, which was leaked before.
    with open(mapping_path, 'rb') as mapping_file:
        mirna2gene = pickle.load(mapping_file)

    totals = {}
    for rna, rank in mirna_ranks:
        rna = rna.lower()
        if rna not in mirna2gene:
            continue
        for gene in mirna2gene[rna]:
            if gene in totals:
                totals[gene] += rank
            else:
                totals[gene] = rank

    # Sort on (rank, gene) so ties between ranks are broken by gene, then
    # flip each pair into the (gene, rank) output form.
    ordered = sorted(((total, gene) for gene, total in totals.items()),
                     reverse=True)
    return {'gene_ranks': [(str(gene), total) for total, gene in ordered]}
#end


def segmine_mirna_to_gene_tarbase(input_dict):
    """Convert ``input_dict['mirna_ranks']`` to gene ranks via the
    ``data/mirna2gene_tarbase`` mapping.

    Returns ``{'gene_ranks': [(gene, rank), ...]}`` sorted by descending rank.
    """
    return _segmine_mirna_ranks_to_gene_ranks(input_dict['mirna_ranks'],
                                              'data/mirna2gene_tarbase')
#end


def segmine_mirna_to_gene_targetscan(input_dict):
    """Convert ``input_dict['mirna_ranks']`` to gene ranks via the
    ``data/mirna2gene_targetscan`` mapping.

    Returns ``{'gene_ranks': [(gene, rank), ...]}`` sorted by descending rank.
    """
    return _segmine_mirna_ranks_to_gene_ranks(input_dict['mirna_ranks'],
                                              'data/mirna2gene_targetscan')
#end
def __makeExampleTable(namesDict, data):
    """Build an Orange ExampleTable with one example per sample column.

    namesDict -- maps each group key (control / data) to a dict keyed by the
                 column names belonging to that group
    data      -- ``data[geneID][groupKey][columnName]`` holds the value of a
                 gene in a given column

    The genes, in sorted order, become float attributes and the group key is
    the class value.  Control-group columns are appended first, followed by
    the data-group columns.
    """
    import orange
    from constants import CLASS_ATRR_NAME, CONTROL_GROUP_KEY, DATA_GROUP_KEY

    geneIDs = sorted(data.keys())
    features = [orange.FloatVariable(name=str(gid)) for gid in geneIDs]
    groupVar = orange.EnumVariable(name=CLASS_ATRR_NAME,
                                   values=[CONTROL_GROUP_KEY, DATA_GROUP_KEY])
    domain = orange.Domain(features, groupVar)
    table = orange.ExampleTable(domain)

    # one pass per group: control columns first, then data columns
    for groupKey in (CONTROL_GROUP_KEY, DATA_GROUP_KEY):
        for column in namesDict[groupKey].keys():
            row = [data[gid][groupKey][column] for gid in geneIDs]
            row.append(groupKey)
            table.append(orange.Example(domain, row))
    return table
#end
def segmine_read_microarray_data(input_dict):
    """Read a two-group microarray file and compute per-gene fold changes.

    Expected entries of ``input_dict``:
      'file' -- path of the text file to read
      'idf'  -- input data format: 1 means linear, anything else log2
      'cm'   -- calculation method: 1 means ratio, anything else difference

    File layout: values are separated by commas and/or whitespace.  The first
    line is a header whose first field names the gene column; the first
    character of every other column name is its group prefix, and exactly two
    distinct prefixes must occur.  The control group is the one prefixed with
    DEFAULT_CONTROL_GROUP_ID when that prefix is present, otherwise the group
    of the first sample column.  Every following line holds a gene id and
    one float per sample column; repeated gene ids are merged by averaging.

    Returns ``{'table': ..., 'fold_change': [(geneID, value), ...]}`` where
    the table is built by ``__makeExampleTable`` from the averaged values
    (before any linear/log2 conversion) and the fold changes are sorted in
    descending order.  In ratio mode, genes with a negative group mean are
    reported on stdout and left out of the fold-change list.

    Raises ValueError for an empty file, a malformed header, a line with the
    wrong number of fields, a non-numeric value, or a non-positive linear
    value that has to be converted to log2.
    """
    from numpy import mean
    import math
    from constants import CONTROL_GROUP_KEY, DATA_GROUP_KEY, DEFAULT_CONTROL_GROUP_ID

    # The context manager closes the file handle, which was leaked before.
    with open(input_dict['file']) as infile:
        raw = infile.read()

    dataFormat = 'linear' if int(input_dict['idf']) == 1 else 'log2'
    calcMethod = 'ratio' if int(input_dict['cm']) == 1 else 'difference'

    lines = [x.replace(',', ' ').split() for x in raw.splitlines()]
    if not lines:
        raise ValueError('Invalid data: the input file is empty')
    names = lines[0][1:]  # skip name of gene column

    # find the prefix of the data channel (the first group prefix is fixed in advance)
    pfs = set(name[0] for name in names)
    if len(pfs) != 2:
        # The check also fires for fewer than two prefixes, so the message
        # must not claim that there are too many.
        raise ValueError('Invalid data header: exactly two column name prefixes expected, found: %s' % str(list(pfs)))

    # if the data do not obey the default rule, the first character of the first column
    # is the identifier of the first group
    if DEFAULT_CONTROL_GROUP_ID in pfs:
        controlGroupID = DEFAULT_CONTROL_GROUP_ID
    else:
        controlGroupID = names[0][0]
    pfs.remove(controlGroupID)
    dataGroupID = list(pfs)[0]

    # map every column name of both groups to its position in the header
    # (first occurrence wins when a column name is repeated)
    controlGroupNames = {}
    dataGroupNames = {}
    for position, name in enumerate(names):
        if name.startswith(controlGroupID):
            controlGroupNames.setdefault(name, position)
        elif name.startswith(dataGroupID):
            dataGroupNames.setdefault(name, position)
    namesDict = {CONTROL_GROUP_KEY: controlGroupNames, DATA_GROUP_KEY: dataGroupNames}
    groupKeys = (CONTROL_GROUP_KEY, DATA_GROUP_KEY)

    # parse and store the actual data; values of repeated genes are collected
    # in lists and merged afterwards
    data = {}
    for ln, elts in enumerate(lines[1:], 1):
        if len(elts) != len(names) + 1:  # EntrezID is the first value
            raise ValueError('Wrong number of values, line: %d' % ln)
        geneID = str(elts[0])
        try:
            vals = [float(x) for x in elts[1:]]
        except ValueError:
            raise ValueError('Error while reading values, line: %d' % ln)

        if geneID not in data:
            # init storage
            data[geneID] = dict(
                (groupKey, dict((name, []) for name in namesDict[groupKey]))
                for groupKey in groupKeys)
        for groupKey in groupKeys:
            for name, index in namesDict[groupKey].items():
                data[geneID][groupKey][name].append(vals[index])

    # merge duplicates by averaging
    for geneID in data:
        for groupKey in groupKeys:
            group = data[geneID][groupKey]
            for atrName in group:
                values = group[atrName]
                group[atrName] = sum(values) / float(len(values))

    # the table is built before the values are converted below
    table = __makeExampleTable(namesDict, data)

    logFCs = {}
    if calcMethod == 'ratio':
        if dataFormat == 'log2':
            # log2 data have to be transformed for ratio computation
            for geneID in data:
                for groupKey in groupKeys:
                    group = data[geneID][groupKey]
                    for attrName in namesDict[groupKey]:
                        group[attrName] = math.pow(2, group[attrName])
        for geneID in data:
            control_array = [data[geneID][CONTROL_GROUP_KEY][attrName]
                             for attrName in namesDict[CONTROL_GROUP_KEY]]
            data_array = [data[geneID][DATA_GROUP_KEY][attrName]
                          for attrName in namesDict[DATA_GROUP_KEY]]
            numerator = mean(data_array)
            denumerator = mean(control_array)
            if numerator < 0 or denumerator < 0:
                print('Invalid values, gene %s' % str(geneID))
                continue
            ratio = numerator / denumerator
            # for those less than 1 invert and give negative sign
            logFCs[geneID] = -1.0 / ratio if ratio < 1 else ratio
    else:
        # difference
        if dataFormat == 'linear':
            # linear data have to be transformed for log2 difference computation
            for geneID in data:
                for groupKey in groupKeys:
                    group = data[geneID][groupKey]
                    for attrName in namesDict[groupKey]:
                        if group[attrName] <= 0:
                            raise ValueError('Cannot transform linear data to log2: value is <= 0 for gene %s' % str(geneID))
                        group[attrName] = math.log(group[attrName], 2)
        for geneID in data:
            control_array = [data[geneID][CONTROL_GROUP_KEY][attrName]
                             for attrName in namesDict[CONTROL_GROUP_KEY]]
            data_array = [data[geneID][DATA_GROUP_KEY][attrName]
                          for attrName in namesDict[DATA_GROUP_KEY]]
            logFCs[geneID] = mean(data_array) - mean(control_array)

    # Rank only the genes that received a fold change.  Iterating over all of
    # `data` raised KeyError for genes skipped as invalid in ratio mode.
    ranked = sorted(((fc, geneID) for geneID, fc in logFCs.items()), reverse=True)
    sortedLogFCs = [(geneID, fc) for fc, geneID in ranked]
    return {'table': table, 'fold_change': sortedLogFCs}
#end
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment