Commit ebd718cd authored by Matic Perovšek

orange table visualization supports missing values

orange convert table supports no class datasets
database context supports multiple relations between two tables
wordification lowered time complexity
wordification w-items feature
parent 5fc90a4d
This diff is collapsed.
...@@ -89,5 +89,7 @@ def ilp_wordification(input_dict): ...@@ -89,5 +89,7 @@ def ilp_wordification(input_dict):
target_table = input_dict.get('target_table',None) target_table = input_dict.get('target_table',None)
other_tables = input_dict.get('other_tables', None) other_tables = input_dict.get('other_tables', None)
context = input_dict.get('context', None) context = input_dict.get('context', None)
wordification = Wordification(target_table,other_tables,context) word_att_length = int(input_dict.get('f_ngram_size', 1))
wordification = Wordification(target_table,other_tables,context,word_att_length)
return {'corpus' : wordification.wordify()} return {'corpus' : wordification.wordify()}
...@@ -2,7 +2,7 @@ from collections import defaultdict ...@@ -2,7 +2,7 @@ from collections import defaultdict
import string import string
class Wordification(object): class Wordification(object):
def __init__(self,target_table,other_tables,context): def __init__(self,target_table,other_tables,context,word_att_length):
""" """
Wordification object constructor. Wordification object constructor.
...@@ -12,57 +12,133 @@ class Wordification(object): ...@@ -12,57 +12,133 @@ class Wordification(object):
self.target_table=target_table self.target_table=target_table
self.other_tables=other_tables self.other_tables=other_tables
self.context=context self.context=context
self.word_att_length=word_att_length
self.connecting_tables=defaultdict(list) self.connecting_tables=defaultdict(list)
self.cached_sentences=defaultdict(dict) self.cached_sentences=defaultdict(dict)
self.lll=defaultdict(int) self.lll=defaultdict(int)
#finds table connections #finds table connections
for primary_table in [target_table]+other_tables: for primary_table in [target_table]+other_tables:
for secondary_table in [target_table]+other_tables: for secondary_table in [target_table]+other_tables:
if (primary_table.name,secondary_table.name) in self.context.connected: if (primary_table.name,secondary_table.name) in self.context.connected:
for primary_key,foreign_key in self.context.connected[(primary_table.name,secondary_table.name)]: for primary_key,foreign_key in self.context.connected[(primary_table.name,secondary_table.name)]:
if self.context.pkeys[primary_table.name] == primary_key: if self.context.pkeys[primary_table.name] == primary_key:
self.connecting_tables[primary_table].append((secondary_table,foreign_key)) self.connecting_tables[primary_table].append((secondary_table,foreign_key,None))
#else:
# self.connecting_tables[primary_table].append((secondary_table,primary_key,foreign_key))
self.index_by_value={}
for table in [target_table]+other_tables:
self.index_by_value[table.name]={}
for sec_t,sec_fkey,prim_fkey in [item for sublist in self.connecting_tables.values() for item in sublist]:
#if sec_t==table:
if not prim_fkey:
self.index_by_value[sec_t.name][sec_fkey]=defaultdict(list)
for ex in sec_t:
self.index_by_value[sec_t.name][sec_fkey][str(ex[str(sec_fkey)])].append(ex)
else:
if not prim_fkey in self.index_by_value[sec_t.name]:
self.index_by_value[sec_t.name][prim_fkey]=defaultdict(list)
for ex in sec_t:
self.index_by_value[sec_t.name][prim_fkey][str(ex[str(prim_fkey)])].append(ex)
print self.connecting_tables
def wordify(self): def wordify(self):
""" """
Applies the wordification methodology on the target table Applies the wordification methodology on the target table
""" """
s=string.join(["!"+str(ex.get_class())+" "+string.join(self.wordify_example(self.target_table,ex)," ") for ex in self.target_table],"\n")
print sorted(self.lll.items(),key=lambda k: [k[1],k[0]],reverse=True)
return s
def wordify_example(self,data,ex): #class + wordification on every example of the main table
a=[]
for i,ex in enumerate(self.target_table):
a.append("!"+str(ex.get_class())+" "+string.join(self.wordify_example(self.target_table,ex)," "))
s=string.join(a,"\n")
#print sorted(self.lll.items(),key=lambda k: [k[1],k[0]],reverse=True)
#print a
#print "s"
return s#[0:10000000]
def wordify_example(self,data,ex,searched_connections=set([])):
""" """
Recursively constructs the 'wordification' document for the given example. Recursively constructs the 'wordification' document for the given example.
@param data The given examples ExampleTable @param data The given examples ExampleTable
@param ex Example for which the document is constructed @param ex Example for which the document is constructed
""" """
debug=False
data_name=str(data.name) data_name=str(data.name)
if data_name=="ring_strucs":
print data_name
if debug:
print "======================================"
print "example:",ex
print "table name:", data_name
print "searched_connections:",len(searched_connections),searched_connections
print "connecting_tables:",len(self.connecting_tables[data]),self.connecting_tables[data]
ex_pkey_value=data.name in self.context.pkeys and ex[str(self.context.pkeys[data.name])] ex_pkey_value=data.name in self.context.pkeys and ex[str(self.context.pkeys[data.name])]
self.lll[data_name+" "+str(ex_pkey_value)]+=1 self.lll[data_name+" "+str(ex_pkey_value)]+=1
if not data_name in self.cached_sentences or not str(ex_pkey_value) in self.cached_sentences[data.name]: if not data_name in self.cached_sentences or not str(ex_pkey_value) in self.cached_sentences[data.name]:
#else: #else:
print data_name,str(ex_pkey_value) #print data_name,str(ex_pkey_value)
words=[] #word list for every example words=[] #word list for every example
if debug:
print "words:",len(words)
#Construct words (tableName_attributeName_attributeValue) from the given table #Construct words (tableName_attributeName_attributeValue) from the given table
for att in data.domain.attributes: for att in data.domain.attributes:
if not str(att.name) in self.context.pkeys[data.name] and not str(att.name) in self.context.fkeys[data.name]: if not str(att.name) in self.context.pkeys[data.name] and not str(att.name) in self.context.fkeys[data.name]:
words.append(self.att_to_s(data.name)+"_"+self.att_to_s(att.name)+"_"+self.att_to_s(ex[att])) words.append(self.att_to_s(data.name)+"_"+self.att_to_s(att.name)+"_"+self.att_to_s(ex[att]))
#Apply the wordification methodology recursively on all connecting tables #words from pairs of attributes
for sec_t,sec_fkey in self.connecting_tables[data]: single_words=words[:]
for sec_ex in sec_t: if self.word_att_length>1:
if ex_pkey_value and sec_ex[str(sec_fkey)]==ex_pkey_value: for i,att1 in enumerate(single_words):
words+=self.wordify_example(sec_t,sec_ex) for j,att2 in enumerate(single_words):
#print words if i<j:
words.append(att1+"__"+att2)
#print "2",words[-1]
if self.word_att_length>2:
for i,att1 in enumerate(single_words):
for j,att2 in enumerate(single_words):
for k,att3 in enumerate(single_words):
if i<j and j<k:
words.append(att1+"__"+att2+"__"+att3)
#print "3",words[-1]
#Apply the wordification methodology recursively on all connecting tables
for sec_t,sec_fkey,prim_fkey in self.connecting_tables[data]:
#for sec_ex in sec_t:
# if ex_pkey_value and sec_ex[str(sec_fkey)]==ex_pkey_value:
# words+=self.wordify_example(sec_t,sec_ex)
#print sec_t,sec_fkey,prim_fkey
if debug:
print "------------------"
print "(sec_t,sec_fkey,prim):",(sec_t.name,sec_fkey,prim_fkey)
print "search this table:",not (sec_t,sec_fkey) in searched_connections and sec_t!=self.target_table
print "search this table:",not prim_fkey or not (data,sec_fkey) in searched_connections# and sec_t!=self.target_table
if not (sec_t,sec_fkey) in searched_connections and sec_t!=self.target_table and (not prim_fkey or not (data,sec_fkey) in searched_connections):
by_value=self.index_by_value[sec_t.name][str(sec_fkey)][str(ex_pkey_value)] if not prim_fkey else self.index_by_value[sec_t.name][str(prim_fkey)][str(ex[str(sec_fkey)])]
for sec_ex in by_value:
words+=self.wordify_example(sec_t,sec_ex,searched_connections | set([(sec_t,sec_fkey),prim_fkey and (data,prim_fkey)]))
self.cached_sentences[data_name][str(ex_pkey_value)]=words self.cached_sentences[data_name][str(ex_pkey_value)]=words
else:
print data_name,str(ex_pkey_value), "cache: hit"
return self.cached_sentences[data_name][str(ex_pkey_value)] return self.cached_sentences[data_name][str(ex_pkey_value)]
def att_to_s(self,att): def att_to_s(self,att):
...@@ -71,5 +147,4 @@ class Wordification(object): ...@@ -71,5 +147,4 @@ class Wordification(object):
@param att Orange attribute @param att Orange attribute
""" """
return str(att).title().replace(' ','').replace('_','') return str(att).title().replace(' ','').replace('_','')
...@@ -63,8 +63,8 @@ class DBContext: ...@@ -63,8 +63,8 @@ class DBContext:
if col.endswith('_id'): if col.endswith('_id'):
ref_table = (col[:-4] + 'ies') if col[-4] == 'y' else (col[:-3] + 's') ref_table = (col[:-4] + 'ies') if col[-4] == 'y' else (col[:-3] + 's')
if ref_table in self.tables: if ref_table in self.tables:
self.connected[(table, ref_table)] = (col, 'id') self.connected[(table, ref_table)].append((col, 'id'))
self.connected[(ref_table, table)] = ('id', col) self.connected[(ref_table, table)].append(('id', col))
self.fkeys[table].add(col) self.fkeys[table].add(col)
if col == 'id': if col == 'id':
self.pkeys[table] = col self.pkeys[table] = col
......
...@@ -259,7 +259,7 @@ class Orange_Converter(Converter): ...@@ -259,7 +259,7 @@ class Orange_Converter(Converter):
import orange import orange
cols = self.db.cols[table_name] cols = self.db.cols[table_name]
attributes, metas, class_var = [], [], [] attributes, metas, class_var = [], [], None
for col in cols: for col in cols:
att_type = self.orng_type(table_name,col) att_type = self.orng_type(table_name,col)
if att_type == 'd': if att_type == 'd':
...@@ -272,13 +272,13 @@ class Orange_Converter(Converter): ...@@ -272,13 +272,13 @@ class Orange_Converter(Converter):
if col == cls_att: if col == cls_att:
if att_type == 'string': if att_type == 'string':
raise Exception('Unsuitable data type for a target variable: %s' % att_type) raise Exception('Unsuitable data type for a target variable: %s' % att_type)
class_var.append(att_var) class_var=att_var
continue continue
elif att_type == 'string' or col in self.db.pkeys[table_name] or col in self.db.fkeys[table_name]: elif att_type == 'string' or table_name in self.db.pkeys and col in self.db.pkeys[table_name] or table_name in self.db.fkeys and col in self.db.fkeys[table_name]:
metas.append(att_var) metas.append(att_var)
else: else:
attributes.append(att_var) attributes.append(att_var)
domain = orange.Domain(attributes + class_var) domain = orange.Domain(attributes, class_var)
for meta in metas: for meta in metas:
domain.addmeta(orange.newmetaid(), meta) domain.addmeta(orange.newmetaid(), meta)
dataset = orange.ExampleTable(domain) dataset = orange.ExampleTable(domain)
...@@ -286,7 +286,7 @@ class Orange_Converter(Converter): ...@@ -286,7 +286,7 @@ class Orange_Converter(Converter):
for row in self.db.rows(table_name, cols): for row in self.db.rows(table_name, cols):
example = orange.Example(domain) example = orange.Example(domain)
for col, val in zip(cols, row): for col, val in zip(cols, row):
example[str(col)] = str(val) example[str(col)] = str(val) if val!=None else '?'
dataset.append(example) dataset.append(example)
return dataset return dataset
......
This diff is collapsed.
...@@ -13,6 +13,10 @@ def mysql_db_context(request, input_dict, output_dict, widget): ...@@ -13,6 +13,10 @@ def mysql_db_context(request, input_dict, output_dict, widget):
initial_context = DBContext(con, find_connections=find_con) initial_context = DBContext(con, find_connections=find_con)
initial_target_cols = initial_context.cols[initial_context.target_table] initial_target_cols = initial_context.cols[initial_context.target_table]
cols_dump = json.dumps(initial_context.cols) cols_dump = json.dumps(initial_context.cols)
return render(request, 'interactions/db_context.html', {'widget':widget, 'context': initial_context, 'target_cols' : initial_target_cols, 'cols' : cols_dump}) return render(request, 'interactions/db_context.html', {'widget':widget,
'context': initial_context,
'connections' : dict(initial_context.connected),
'target_cols' : initial_target_cols,
'cols' : cols_dump})
...@@ -33,7 +33,7 @@ ...@@ -33,7 +33,7 @@
<tr><th>Table</th><th>Referenced table</th><th>Column</th><th>Referenced column</th></tr> <tr><th>Table</th><th>Referenced table</th><th>Column</th><th>Referenced column</th></tr>
</thead> </thead>
<tbody> <tbody>
{% for tables, cols in context.connected.items %} {% for tables, cols in connections.items %}
{% for cols_inner in cols %} {% for cols_inner in cols %}
<tr><td>{{tables.0}}</td><td>{{tables.1}}</td><td>{{cols_inner.0}}</td><td>{{cols_inner.1}}</td></tr> <tr><td>{{tables.0}}</td><td>{{tables.1}}</td><td>{{cols_inner.0}}</td><td>{{cols_inner.1}}</td></tr>
{% endfor %} {% endfor %}
......
This diff is collapsed.
def benchmark(input_dict):
    """
    Pass a value through unchanged and report a wall-clock measurement.

    @param input_dict dictionary read with the keys:
        'in_att'     - any value; returned untouched as 'out_att'
        'start_time' - optional timestamp in seconds (number or numeric
                       string) as produced by time.time()
    @return dictionary with
        'out_att'   - the value given as 'in_att' (None if it was absent)
        'time_diff' - seconds elapsed since 'start_time' when one is given,
                      otherwise the current time.time() value
    """
    import time
    in_att = input_dict.get('in_att', None)
    start_time = input_dict.get('start_time', None)
    now = time.time()
    if start_time:
        # Input values may arrive as strings, so coerce before subtracting;
        # a plain float is unaffected by the conversion.
        time_diff = now - float(start_time)
    else:
        # No (or empty) start time: emit the current timestamp.
        # NOTE(review): presumably this output is wired into a later
        # benchmark call as its 'start_time' - confirm against workflow usage.
        time_diff = now
    return {'out_att': in_att, 'time_diff': time_diff}
...@@ -123,7 +123,7 @@ def orng_table_to_dict(data): ...@@ -123,7 +123,7 @@ def orng_table_to_dict(data):
metas.append(data.domain.get_meta(m).name) metas.append(data.domain.get_meta(m).name)
for a in data.domain.attributes: for a in data.domain.attributes:
attrs.append(a.name) attrs.append(a.name)
pretty_float = lambda x, a: '%.3f' % x if a.var_type == Orange.feature.Type.Continuous else x pretty_float = lambda x, a: '%.3f' % x if a.var_type == Orange.feature.Type.Continuous and x!='?' else x
for inst in xrange(len(data)): for inst in xrange(len(data)):
inst_new = [] inst_new = []
for a in data.domain.variables: for a in data.domain.variables:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment