Commit c40f4c76 authored by Izidorf

bacon

parent 9f42806f
# naive bayes
# from sklearn import datasets
# iris = datasets.load_iris()
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# y_pred = gnb.fit(iris.data, iris.target).predict(iris.data)
# print "Number of mislabeled points : %d" % (iris.target != y_pred).sum()
#print iris
#print iris.data
# print iris.target
# converts the data in the csv file into a scikit-learn style dataset
# (a tuple of samples, targets and attribute names) and returns it wrapped
# in a dictionary under the key "data"
def getDataFromCSV():
    from numpy import genfromtxt
    my_data = genfromtxt('iris.csv', delimiter=',')
    n_sample = []
    n_feature = []
    attributes = []
    for x in my_data:
        n_feature.append(x[-1])   # last column is the class value
        n_sample.append(x[:-1])   # remaining columns are the features
    dataset = (n_sample, n_feature, attributes)
    dictionary = {"data": dataset}
    return dictionary
def naiveBayes(dictionary):
    dataset = dictionary["data"]
    n_sample = dataset[0]
    n_feature = dataset[1]
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    y_pred = gnb.fit(n_sample, n_feature).predict(n_sample)
    print "Expected Classification"
    print n_feature
    print "Predicted Classification"
    print y_pred
    print "Number of mislabeled points : %d" % (n_feature != y_pred).sum()
#data = getDataFromCSV()
#print data
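# Example run (sketch, not in the original file): chain the loader and the
# Naive Bayes step above, assuming an 'iris.csv' with numeric features and
# the class index in the last column is present in the working directory.
# data = getDataFromCSV()
# naiveBayes(data)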
# ********************* ************************* ***********************
# GENERATE DECISION TREE PDF
# def decisionTreeJ48():
# from sklearn.datasets import load_iris
# from sklearn import tree
# iris = load_iris()
# clf = tree.DecisionTreeClassifier()
# clf = clf.fit(iris.data, iris.target)
# from StringIO import StringIO
# out = StringIO()
# out = tree.export_graphviz(clf, out_file=out)
# print out.getvalue()
# import StringIO, pydot
# dot_data = StringIO.StringIO()
# tree.export_graphviz(clf, out_file=dot_data)
# graph = pydot.graph_from_dot_data(dot_data.getvalue())
# graph.write_pdf("iris.pdf")
# #graph_from_dot_data(out.getvalue()).write_pdf("iris.pdf")
# decisionTreeJ48()
# GENERATE DECISION TREE PNG
def decisionTreeJ48(dictionary):
    dataset = dictionary["data"]
    n_sample = dataset[0]
    n_feature = dataset[1]
    from sklearn import tree
    from os import system
    clf = tree.DecisionTreeClassifier()
    clf = clf.fit(n_sample, n_feature)
    # export the fitted tree to a Graphviz .dot file and render it to PNG
    # (requires the Graphviz 'dot' binary on the PATH)
    dotfile = open("decisionTreeJ48-scikit.dot", 'w')
    dotfile = tree.export_graphviz(clf, out_file=dotfile)
    dotfile.close()
    system("dot -Tpng decisionTreeJ48-scikit.dot -o decisionTreeJ48-scikit.png")
# data = getDataFromCSV()
# decisionTreeJ48(data)
def ScikitDatasetToCSV(dictionary):
    dataset = dictionary["data"]
    n_sample = dataset[0]
    n_feature = dataset[1]
    import numpy
    csv = []
    count = 0
    for sample in n_sample:
        csv.append(numpy.append(sample, n_feature[count]))  # join sample features with the class value
        count += 1
    numpy.savetxt("foo.csv", csv, delimiter=",")
#data = getDataFromCSV()
# def ScikitDatasetToTAB(dictionary):
# dataset = dictionary["data"]
# n_sample = dataset[0]
# n_feature = dataset[1]
# import numpy
# csv=[]
# count=0
# for sample in n_sample:
# csv.append(numpy.append(sample,n_feature[count])) #join n_sample and n_feature array
# count+=1
# numpy.savetxt("foo.csv", csv, delimiter=" ")
# my_data = genfromtxt("foo.csv", delimiter=" ")
# print my_data
# ScikitDatasetToTAB(data)
def NNK(input_dict):
    dataset = input_dict["data"]
    n_sample = dataset[0]
    n_feature = dataset[1]
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier().fit(n_sample, n_feature)
    print knn
def SVC(input_dict):
    dataset = input_dict["data"]
    n_sample = dataset[0]
    n_feature = dataset[1]
    from sklearn.svm import SVC
    clf = SVC().fit(n_sample, n_feature)
    print clf
# SVC(data)
def nearestNeighbour():
    import numpy as np
    import pylab as pl
    from matplotlib.colors import ListedColormap
    from sklearn import datasets
    from sklearn.neighbors import NearestCentroid
    n_neighbors = 15
    # import some data to play with
    iris = datasets.load_iris()
    X = iris.data[:, :2]  # we only take the first two features; we could
                          # avoid this ugly slicing by using a two-dim dataset
    y = iris.target
    h = .02  # step size in the mesh
    # Create color maps
    cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
    cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
    for shrinkage in [None, 0.1]:
        # we create an instance of the NearestCentroid classifier and fit the data
        clf = NearestCentroid(shrink_threshold=shrinkage)
        clf.fit(X, y)
        y_pred = clf.predict(X)
        print shrinkage, np.mean(y == y_pred)
        # Plot the decision boundary. For that, we will assign a color to each
        # point in the mesh [x_min, x_max] x [y_min, y_max].
        x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
        y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # Put the result into a color plot
        Z = Z.reshape(xx.shape)
        pl.figure()
        pl.pcolormesh(xx, yy, Z, cmap=cmap_light)
        # Plot also the training points
        pl.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold)
        pl.title("3-Class classification (shrink_threshold=%r)"
                 % shrinkage)
        pl.axis('tight')
    # pl.show()
#nearestNeighbour()
#
# REGRESSION EXAMPLES
#
#L2-regularized least squares linear model
def RidgeRegression(input_dict):
    # from sklearn.datasets import load_iris
    # from sklearn import tree
    # iris = load_iris()
    # clf = tree.DecisionTreeClassifier()
    # clf = clf.fit(iris.data, iris.target)
    from sklearn.datasets import load_diabetes
    diabetes = load_diabetes()
    n_sample = diabetes.data
    n_feature = diabetes.target
    print "*******SAMPLES********"
    print n_sample
    print "*******FEATURES*******"
    print n_feature
    from sklearn.linear_model import Ridge
    rgs = Ridge().fit(n_sample, n_feature)
    print rgs
    print rgs.predict(n_sample)
# L1+L2-regularized least squares linear model trained using Coordinate Descent
def ElasticNetRegression(input_dict):
    # from sklearn.datasets import load_iris
    # from sklearn import tree
    # iris = load_iris()
    # clf = tree.DecisionTreeClassifier()
    # clf = clf.fit(iris.data, iris.target)
    from sklearn.datasets import load_diabetes
    dta = load_diabetes()
    n_sample = dta.data
    n_feature = dta.target
    print "*******SAMPLES********"
    print n_sample
    print "*******FEATURES*******"
    print n_feature
    from sklearn.linear_model import ElasticNet
    rgs = ElasticNet().fit(n_sample, n_feature)
    print rgs
    print rgs.predict(n_sample)
# ElasticNetRegression(data)
def ClusteringKMeans():
    from sklearn import cluster, datasets
    iris = datasets.load_iris()
    X_iris = iris.data
    y_iris = iris.target
    k_means = cluster.KMeans(k=3)  # number of clusters
    k_means.fit(X_iris)
    print k_means.labels_
    print y_iris
#ClusteringKMeans()
def ClusteringMS():
    from sklearn import cluster, datasets
    iris = datasets.load_iris()
    X_iris = iris.data
    y_iris = iris.target
    spectral = cluster.SpectralClustering(3)  # number of clusters
    spectral.fit(X_iris)
    print spectral.labels_
    print y_iris
#ClusteringMS()
def test():
    print(__doc__)
    from sklearn import datasets, neighbors, linear_model
    digits = datasets.load_digits()
    X_digits = digits.data
    y_digits = digits.target
    n_samples = len(X_digits)
    split = int(.9 * n_samples)  # use 90% of the samples for training, 10% for testing
    X_train = X_digits[:split]
    y_train = y_digits[:split]
    X_test = X_digits[split:]
    y_test = y_digits[split:]
    knn = neighbors.KNeighborsClassifier()
    logistic = linear_model.LogisticRegression()
    print('KNN score: %f' % knn.fit(X_train, y_train).score(X_test, y_test))
    print('LogisticRegression score: %f'
          % logistic.fit(X_train, y_train).score(X_test, y_test))
#test()
def scikitAlgorithms_UCIDataset(var):
    from sklearn import datasets
    allDSets = {"iris": datasets.load_iris(), "boston": datasets.load_boston(), "diabetes": datasets.load_diabetes()}
    dataset = allDSets[var]
    output_dict = dataset
    return output_dict
#iris = scikitAlgorithms_UCIDataset("iris")
#print iris
def returnRightType(value):
    # convert a text parameter to int or float where possible, otherwise keep it as a string
    try:
        a = float(value) if '.' in value else int(value)  # return int or float
    except ValueError:
        a = value  # return string
    print "input par", a
    print "is type of", type(a)
    return a
returnRightType("ahas")
#print "input par", a
#print "is type of", type(a)
@@ -12,7 +12,7 @@
}
},
{
"pk": 27,
"pk": 30,
"model": "workflows.category",
"fields": {
"uid": "317018eb-3bc8-4862-bce8-273451a01de4",
@@ -24,10 +24,10 @@
}
},
{
"pk": 151,
"pk": 172,
"model": "workflows.abstractwidget",
"fields": {
"category": 27,
"category": 30,
"treeview_image": "",
"name": "Decision Tree",
"is_streaming": false,
@@ -46,14 +46,50 @@
"interactive": false,
"has_progress_bar": false,
"order": 1,
"description": "Creates a decision tree"
"description": "max_features : {int, float or string } The number of features to consider when looking for the best split:\r\n\r\nint, then consider max_features features at each split.\r\nfloat, then max_features is a percentage and int(max_features * n_features) features are considered at each split.\r\n“auto”, then max_features=sqrt(n_features).\r\n“sqrt”, then max_features=sqrt(n_features).\r\n“log2”, then max_features=log2(n_features).\r\nNone, then max_features=n_features.\r\n\r\nmax_depth : {integer} The maximum depth of the tree. \r\n"
}
},
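The description above mirrors the scikit-learn DecisionTreeClassifier arguments that the two new widget inputs (max_feature and max_depth, exposed below through the variables featureIn and depthIn) are meant to control. As an illustrative sketch only, not part of this fixture or the committed widget code, the widget function might forward them like this:

def decision_tree_with_params(input_dict):
    # sketch (assumption): 'featureIn' / 'depthIn' match the 'variable' fields of the inputs below
    from sklearn import tree
    n_sample, n_feature = input_dict["data"][0], input_dict["data"][1]
    # text parameters arrive as strings; numeric max_features values would need conversion
    max_features = input_dict.get("featureIn", "auto")  # "auto", "sqrt", "log2" or None
    max_depth = int(input_dict.get("depthIn", 100))      # maximum depth of the tree
    clf = tree.DecisionTreeClassifier(max_features=max_features, max_depth=max_depth)
    return clf.fit(n_sample, n_feature)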
{
"pk": 172,
"pk": 376,
"model": "workflows.abstractinput",
"fields": {
"widget": 172,
"name": "max_feature",
"short_name": "par",
"uid": "61d7acc2-72c8-4943-a602-fcbabea6cc0f",
"default": "auto",
"required": false,
"multi": false,
"parameter_type": "text",
"variable": "featureIn",
"parameter": true,
"order": 1,
"description": ""
}
},
{
"pk": 377,
"model": "workflows.abstractinput",
"fields": {
"widget": 172,
"name": "max_depth",
"short_name": "par",
"uid": "d3fe38b1-f3b2-4364-a30e-dac5a27d3dfa",
"default": "100",
"required": false,
"multi": false,
"parameter_type": "text",
"variable": "depthIn",
"parameter": true,
"order": 1,
"description": ""
}
},
{
"pk": 189,
"model": "workflows.abstractoutput",
"fields": {
"widget": 151,
"widget": 172,
"name": "Dataset",
"short_name": "lrn",
"variable": "treeOut",
@@ -63,10 +99,10 @@
}
},
{
"pk": 158,
"pk": 173,
"model": "workflows.abstractwidget",
"fields": {
"category": 27,
"category": 30,
"treeview_image": "",
"name": "Linear Support Vector Classification",
"is_streaming": false,
@@ -85,14 +121,146 @@
"interactive": false,
"has_progress_bar": false,
"order": 1,
"description": "Support Vector Machines without kernels based on liblinear"
"description": "Support Vector Machines without kernels based on liblinear\r\n\r\nC : {float} - Penalty parameter C of the error term.\r\nloss : string, ‘l1’ or ‘l2’ (default=’l2’) Specifies the loss function. ‘l1’ is the hinge loss (standard SVM) while ‘l2’ is the squared hinge loss.\r\npenalty : string, ‘l1’ or ‘l2’ (default=’l2’) Specifies the norm used in the penalization. The ‘l2’ penalty is the standard used in SVC. The ‘l1’ leads to coef_ vectors that are sparse.\r\n\r\nmulti_class: string, ‘ovr’ or ‘crammer_singer’ (default=’ovr’) :\r\nDetermines the multi-class strategy if y contains more than two classes. ovr trains n_classes one-vs-rest classifiers, while crammer_singer optimizes a joint objective over all classes. While crammer_singer is interesting from an theoretical perspective as it is consistent it is seldom used in practice and rarely leads to better accuracy and is more expensive to compute. If crammer_singer is chosen, the options loss, penalty and dual will be ignored."
}
},
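The parameters documented above map onto scikit-learn's LinearSVC constructor; the four inputs added below expose them as penaltyIn (C), lossIn (loss), normIn (penalty) and classIn (multi_class). A minimal sketch of the assumed wiring, not taken from the committed widget code:

def linear_svc_with_params(input_dict):
    # sketch: the 'l1'/'l2' loss names follow the older scikit-learn API documented above;
    # some penalty/loss combinations additionally require dual=False
    from sklearn.svm import LinearSVC
    n_sample, n_feature = input_dict["data"][0], input_dict["data"][1]
    clf = LinearSVC(C=float(input_dict.get("penaltyIn", "1.0")),
                    loss=input_dict.get("lossIn", "l2"),
                    penalty=input_dict.get("normIn", "l2"),
                    multi_class=input_dict.get("classIn", "ovr"))
    return clf.fit(n_sample, n_feature)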
{
"pk": 175,
"pk": 378,
"model": "workflows.abstractinput",
"fields": {
"widget": 173,
"name": "Penalty parameter(C)",
"short_name": "par",
"uid": "2404bc82-b32b-4787-acd4-82ceef160c22",
"default": "1.0",
"required": false,
"multi": false,
"parameter_type": "text",
"variable": "penaltyIn",
"parameter": true,
"order": 1,
"description": ""
}
},
{
"pk": 379,
"model": "workflows.abstractinput",
"fields": {
"widget": 173,
"name": "Loss parameter(loss)",
"short_name": "par",
"uid": "4511ba8c-015c-4e26-8ca4-164c74afde18",
"default": "l2",
"required": false,
"multi": false,
"parameter_type": "select",
"variable": "lossIn",
"parameter": true,
"order": 2,
"description": ""
}
},
{
"pk": 108,
"model": "workflows.abstractoption",
"fields": {
"uid": "64809f7f-3b98-4b63-9c84-66608c1f6a8c",
"abstract_input": 379,
"value": "l1",
"name": "l1"
}
},
{
"pk": 109,
"model": "workflows.abstractoption",
"fields": {
"uid": "554b3404-de82-418a-bc9e-f7fa530acde1",
"abstract_input": 379,
"value": "l2",
"name": "l2"
}
},
{
"pk": 380,
"model": "workflows.abstractinput",
"fields": {
"widget": 173,
"name": "Norm(penalty)",
"short_name": "par",
"uid": "b8b407d1-c11e-4829-a851-f1cd9ebe1409",
"default": "l2",
"required": false,
"multi": false,
"parameter_type": "select",
"variable": "normIn",
"parameter": true,
"order": 3,
"description": ""
}
},
{
"pk": 110,
"model": "workflows.abstractoption",
"fields": {
"uid": "5c1c4169-8fba-4921-8241-e69da36ee797",
"abstract_input": 380,
"value": "l1",
"name": "l1"
}
},
{
"pk": 111,
"model": "workflows.abstractoption",
"fields": {
"uid": "7739f64c-c5e3-4cb7-b506-f5816c2c09c9",
"abstract_input": 380,
"value": "l2",
"name": "l2"
}
},
{
"pk": 381,
"model": "workflows.abstractinput",
"fields": {
"widget": 173,
"name": "multi_class",
"short_name": "par",
"uid": "da13b053-3d1b-48ad-a046-5683143f1391",
"default": "ovr",
"required": false,
"multi": false,
"parameter_type": "select",
"variable": "classIn",
"parameter": true,
"order": 4,
"description": ""
}
},
{
"pk": 113,
"model": "workflows.abstractoption",
"fields": {
"uid": "b009551e-ca6c-428c-b7a1-8fd3b7b49d08",
"abstract_input": 381,
"value": "crammer_singer",
"name": "crammer_singer"
}
},
{
"pk": 112,
"model": "workflows.abstractoption",
"fields": {
"uid": "2b37e801-b2b0-44ae-b4f8-53e1177845f0",
"abstract_input": 381,
"value": "ovr",
"name": "ovr"
}
},
{
"pk": 190,
"model": "workflows.abstractoutput",
"fields": {
"widget": 158,
"widget": 173,
"name": "Learner",
"short_name": "lrn",
"variable": "SVCout",
@@ -102,10 +270,10 @@
}
},
{
"pk": 159,
"pk": 174,
"model": "workflows.abstractwidget",
"fields": {
"category": 27,
"category": 30,
"treeview_image": "",
"name": "Logistic Regression",
"is_streaming": false,
@@ -124,14 +292,70 @@
"interactive": false,
"has_progress_bar": false,
"order": 1,
"description": "penalty : {string} Used to specify the norm used in the penalization.\r\n‘l1’ or ‘l2’\r\n\r\nC : {float} \r\nInverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization."
}
},
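As above, a minimal sketch (an assumption, not the committed widget code) of how the penalty and C inputs declared below (variables penIn and cIn) could be passed to scikit-learn's LogisticRegression:

def logistic_regression_with_params(input_dict):
    from sklearn.linear_model import LogisticRegression
    n_sample, n_feature = input_dict["data"][0], input_dict["data"][1]
    clf = LogisticRegression(penalty=input_dict.get("penIn", "l1"),  # 'l1' or 'l2' norm
                             C=float(input_dict.get("cIn", "1.0")))  # inverse regularization strength
    return clf.fit(n_sample, n_feature)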
{
"pk": 385,
"model": "workflows.abstractinput",
"fields": {
"widget": 174,
"name": "penalty",
"short_name": "par",
"uid": "fb0e5723-41b0-4ef3-9fbd-d88c0a987252",
"default": "l1",
"required": false,
"multi": false,
"parameter_type": "select",
"variable": "penIn",
"parameter": true,
"order": 1,
"description": ""
}
},
{
"pk": 176,
"pk": 118,
"model": "workflows.abstractoption",
"fields": {
"uid": "f8542745-ec0d-4f8e-bd84-f686628fae30",
"abstract_input": 385,
"value": "l1",
"name": "l1"
}
},
{
"pk": 119,
"model": "workflows.abstractoption",
"fields": {
"uid": "9eaf2578-db7b-4341-b9b9-8622aa4d8de8",
"abstract_input": 385,
"value": "l2",
"name": "l2"
}
},
{
"pk": 386,
"model": "workflows.abstractinput",
"fields": {
"widget": 174,
"name": "C",
"short_name": "par",
"uid": "78a44bbf-4da4-46e1-9be2-e3335b04c74b",
"default": "1.0",
"required": false,
"multi": false,
"parameter_type": "text",
"variable": "cIn",
"parameter": true,
"order": 2,