[
  {
    "model": "workflows.abstractwidget", 
    "fields": {
      "category": "1be7b5eb-c1b2-485a-8dbe-56abce63fc73", 
      "treeview_image": "", 
      "uid": "72a39fab-5433-493f-ae22-12a264075356", 
      "is_streaming": false, 
      "package": "big_data", 
      "interaction_view": "", 
      "has_progress_bar": false, 
      "image": "", 
      "description": "Weighted forest with MapReduce\r\n\r\nWeighted forest is a novel ensemble algorithm. \r\n\r\nFit phase\r\nWeighted forest algorithm builds multiple decision trees with a bootstrap method on a subset of data. In each tree node, it estimates sqrt(num. of attributes)+1 randomly selected attributes (without replacement). It uses decision tree to predict out-of-bag samples. For each prediction of an out-of-bag sample, it measures margin (classifier confidence in prediction) and leaf identifier that outputs prediction. Algorithm uses similarity matrix, where it stores similarities for each out-of-bag sample that was predicted with the same leaf. We assume that samples are similar, if the same leaf predicts them multiple times in multiple decision trees. \r\nAfter algorithm builds all decision trees, it passes similarity matrix to k-medoids algorithm. Similarity matrix presents distances between test samples. We set parameter k as sqrt(num. of attributes)+1. k-medoids algorithm outputs medoids, which are test samples in the cluster centers of the dataset. Medoids are actual samples in a dataset, unlike centroids which are centers of clusters. Algorithm measures average margin for all samples that are in the cluster of certain medoid. It saves the average margin of a decision tree in its model. Algorithm uses these scores as weights of decision trees in predict phase.\r\nAlgorithm builds a forest on each subset of the data and it merges them in large ensemble. Each forest has its own medoids.\r\n\r\nPredict phase \r\nAlgorithm selects a forest (or more, if it finds equal similarities with medoids in multiple forests), that contains most similar medoid with a test sample. Then it uses the same procedure like with small data. Algorithm calculates Gower similarity coefficient with a test sample and every medoid. Only decision trees with high margin on similar test samples output prediction and algorithm stores decision tree margin for each prediction. \r\nAlgorithm calculates final values for each prediction: (number of margins) * avg(margins) and it selects prediction with highest value.", 
      "static_image": "", 
      "action": "wrf_fit", 
      "visualization_view": "", 
      "streaming_visualization_view": "", 
      "post_interact_action": "", 
      "wsdl_method": "", 
      "wsdl": "", 
      "interactive": false, 
      "windows_queue": false, 
      "order": 1, 
      "name": "Weighted forest"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Class majority", 
      "short_name": "csm", 
      "default": "1", 
      "description": "Purity of a subset.", 
      "required": true, 
      "multi": false, 
      "parameter_type": "text", 
      "variable": "majority", 
      "parameter": true, 
      "order": 6, 
      "uid": "1b23ead1-b104-4d27-a6fd-b23de6efa28f"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Random state", 
      "short_name": "rsd", 
      "default": "None", 
      "description": "Define a random state", 
      "required": true, 
      "multi": false, 
      "parameter_type": "text", 
      "variable": "seed", 
      "parameter": true, 
      "order": 9, 
      "uid": "31c68e34-3bff-41bb-bf77-925c6171a6f6"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Dataset", 
      "short_name": "dst", 
      "default": "", 
      "description": "", 
      "required": true, 
      "multi": false, 
      "parameter_type": null, 
      "variable": "dataset", 
      "parameter": false, 
      "order": 1, 
      "uid": "37879268-0aa9-4458-afb2-71a521acb299"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Max tree nodes", 
      "short_name": "mnt", 
      "default": "20", 
      "description": "Max. number of decision tree nodes", 
      "required": true, 
      "multi": false, 
      "parameter_type": "text", 
      "variable": "tree_nodes", 
      "parameter": true, 
      "order": 4, 
      "uid": "3d48b0d0-a304-45d5-9d18-3ca17e8fcf05"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Trees per subset", 
      "short_name": "tps", 
      "default": "20", 
      "description": "Number of trees per subset of data", 
      "required": true, 
      "multi": false, 
      "parameter_type": "text", 
      "variable": "trees_per_subset", 
      "parameter": true, 
      "order": 3, 
      "uid": "8770ace6-4968-47cf-b7cc-4ad8bfff303c"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Measure", 
      "short_name": "msr", 
      "default": "info_gain", 
      "description": "Select measure for estimation of attributes.", 
      "required": true, 
      "multi": false, 
      "parameter_type": "select", 
      "variable": "measure", 
      "parameter": true, 
      "order": 7, 
      "uid": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Min samples split", 
      "short_name": "lmi", 
      "default": "5", 
      "description": "Min. number of samples to split the node", 
      "required": true, 
      "multi": false, 
      "parameter_type": "text", 
      "variable": "leaf_min_inst", 
      "parameter": true, 
      "order": 5, 
      "uid": "ac032f38-f4a4-44ea-8c02-96506d4f8e86"
    }
  }, 
  {
    "model": "workflows.abstractinput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Discretization", 
      "short_name": "spf", 
      "default": "equal_freq", 
      "description": "Select equal frequency discretization or random discretization for numeric attributes", 
      "required": true, 
      "multi": false, 
      "parameter_type": "select", 
      "variable": "split_fun", 
      "parameter": true, 
      "order": 8, 
      "uid": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
    }
  }, 
  {
    "model": "workflows.abstractoutput", 
    "fields": {
      "widget": "72a39fab-5433-493f-ae22-12a264075356", 
      "name": "Fit model", 
      "short_name": "fit", 
      "description": "Fit model URL", 
      "variable": "fitmodel_url", 
      "order": 1, 
      "uid": "a862dff5-a324-4d45-977c-abda633714c3"
    }
  }, 
  {
    "model": "workflows.abstractoption", 
    "fields": {
      "name": "Minimum description length", 
      "uid": "293fb2f7-de3f-4133-8e3a-22701245c55d", 
      "value": "mdl", 
      "abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
    }
  }, 
  {
    "model": "workflows.abstractoption", 
    "fields": {
      "name": "Information gain", 
      "uid": "364fbb94-f200-4acc-a801-d29339f0d4c5", 
      "value": "info_gain", 
      "abstract_input": "9a8f3c2c-265c-4b37-93c1-d58fee9dd7af"
    }
  }, 
  {
    "model": "workflows.abstractoption", 
    "fields": {
      "name": "Equal frequency discretization", 
      "uid": "4ea5c55a-92a8-4541-a1cc-9aabb0fd82c0", 
      "value": "equal_freq", 
      "abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
    }
  }, 
  {
    "model": "workflows.abstractoption", 
    "fields": {
      "name": "Random discretization", 
      "uid": "838f798d-e00e-4216-8990-ebc3c1929c0e", 
      "value": "random", 
      "abstract_input": "c43bfc92-e1af-42fc-8d73-f7348ebdaf40"
    }
  }
]