Commit ebd718cd authored by Matic Perovšek

orange table visualization supports missing values

orange convert table supports no class datasets
database context supports multiple relations between two tables
wordification lowered time complexity
wordification w-items feature
parent 5fc90a4d
This diff is collapsed.
......@@ -89,5 +89,7 @@ def ilp_wordification(input_dict):
target_table = input_dict.get('target_table',None)
other_tables = input_dict.get('other_tables', None)
context = input_dict.get('context', None)
wordification = Wordification(target_table,other_tables,context)
word_att_length = int(input_dict.get('f_ngram_size', 1))
wordification = Wordification(target_table,other_tables,context,word_att_length)
return {'corpus' : wordification.wordify()}
......@@ -2,7 +2,7 @@ from collections import defaultdict
import string
class Wordification(object):
def __init__(self,target_table,other_tables,context):
def __init__(self,target_table,other_tables,context,word_att_length):
"""
Wordification object constructor.
......@@ -12,57 +12,133 @@ class Wordification(object):
self.target_table=target_table
self.other_tables=other_tables
self.context=context
self.word_att_length=word_att_length
self.connecting_tables=defaultdict(list)
self.cached_sentences=defaultdict(dict)
self.lll=defaultdict(int)
#finds table connections
for primary_table in [target_table]+other_tables:
for secondary_table in [target_table]+other_tables:
if (primary_table.name,secondary_table.name) in self.context.connected:
for primary_key,foreign_key in self.context.connected[(primary_table.name,secondary_table.name)]:
if self.context.pkeys[primary_table.name] == primary_key:
self.connecting_tables[primary_table].append((secondary_table,foreign_key))
self.connecting_tables[primary_table].append((secondary_table,foreign_key,None))
#else:
# self.connecting_tables[primary_table].append((secondary_table,primary_key,foreign_key))
self.index_by_value={}
for table in [target_table]+other_tables:
self.index_by_value[table.name]={}
for sec_t,sec_fkey,prim_fkey in [item for sublist in self.connecting_tables.values() for item in sublist]:
#if sec_t==table:
if not prim_fkey:
self.index_by_value[sec_t.name][sec_fkey]=defaultdict(list)
for ex in sec_t:
self.index_by_value[sec_t.name][sec_fkey][str(ex[str(sec_fkey)])].append(ex)
else:
if not prim_fkey in self.index_by_value[sec_t.name]:
self.index_by_value[sec_t.name][prim_fkey]=defaultdict(list)
for ex in sec_t:
self.index_by_value[sec_t.name][prim_fkey][str(ex[str(prim_fkey)])].append(ex)
print self.connecting_tables
def wordify(self):
"""
Applies the wordification methodology on the target table
"""
s=string.join(["!"+str(ex.get_class())+" "+string.join(self.wordify_example(self.target_table,ex)," ") for ex in self.target_table],"\n")
print sorted(self.lll.items(),key=lambda k: [k[1],k[0]],reverse=True)
return s
def wordify_example(self,data,ex):
#class + wordification on every example of the main table
a=[]
for i,ex in enumerate(self.target_table):
a.append("!"+str(ex.get_class())+" "+string.join(self.wordify_example(self.target_table,ex)," "))
s=string.join(a,"\n")
#print sorted(self.lll.items(),key=lambda k: [k[1],k[0]],reverse=True)
#print a
#print "s"
return s#[0:10000000]
def wordify_example(self,data,ex,searched_connections=set([])):
"""
Recursively constructs the 'wordification' document for the given example.
@param data The given examples ExampleTable
@param ex Example for which the document is constructed
"""
debug=False
data_name=str(data.name)
if data_name=="ring_strucs":
print data_name
if debug:
print "======================================"
print "example:",ex
print "table name:", data_name
print "searched_connections:",len(searched_connections),searched_connections
print "connecting_tables:",len(self.connecting_tables[data]),self.connecting_tables[data]
ex_pkey_value=data.name in self.context.pkeys and ex[str(self.context.pkeys[data.name])]
self.lll[data_name+" "+str(ex_pkey_value)]+=1
if not data_name in self.cached_sentences or not str(ex_pkey_value) in self.cached_sentences[data.name]:
#else:
print data_name,str(ex_pkey_value)
#print data_name,str(ex_pkey_value)
words=[] #word list for every example
if debug:
print "words:",len(words)
#Construct words (tableName_attributeName_attributeValue) from the given table
for att in data.domain.attributes:
if not str(att.name) in self.context.pkeys[data.name] and not str(att.name) in self.context.fkeys[data.name]:
words.append(self.att_to_s(data.name)+"_"+self.att_to_s(att.name)+"_"+self.att_to_s(ex[att]))
#Apply the wordification methodology recursively on all connecting tables
for sec_t,sec_fkey in self.connecting_tables[data]:
for sec_ex in sec_t:
if ex_pkey_value and sec_ex[str(sec_fkey)]==ex_pkey_value:
words+=self.wordify_example(sec_t,sec_ex)
#print words
#words from pairs of attributes
single_words=words[:]
if self.word_att_length>1:
for i,att1 in enumerate(single_words):
for j,att2 in enumerate(single_words):
if i<j:
words.append(att1+"__"+att2)
#print "2",words[-1]
if self.word_att_length>2:
for i,att1 in enumerate(single_words):
for j,att2 in enumerate(single_words):
for k,att3 in enumerate(single_words):
if i<j and j<k:
words.append(att1+"__"+att2+"__"+att3)
#print "3",words[-1]
#Apply the wordification methodology recursively on all connecting tables
for sec_t,sec_fkey,prim_fkey in self.connecting_tables[data]:
#for sec_ex in sec_t:
# if ex_pkey_value and sec_ex[str(sec_fkey)]==ex_pkey_value:
# words+=self.wordify_example(sec_t,sec_ex)
#print sec_t,sec_fkey,prim_fkey
if debug:
print "------------------"
print "(sec_t,sec_fkey,prim):",(sec_t.name,sec_fkey,prim_fkey)
print "search this table:",not (sec_t,sec_fkey) in searched_connections and sec_t!=self.target_table
print "search this table:",not prim_fkey or not (data,sec_fkey) in searched_connections# and sec_t!=self.target_table
if not (sec_t,sec_fkey) in searched_connections and sec_t!=self.target_table and (not prim_fkey or not (data,sec_fkey) in searched_connections):
by_value=self.index_by_value[sec_t.name][str(sec_fkey)][str(ex_pkey_value)] if not prim_fkey else self.index_by_value[sec_t.name][str(prim_fkey)][str(ex[str(sec_fkey)])]
for sec_ex in by_value:
words+=self.wordify_example(sec_t,sec_ex,searched_connections | set([(sec_t,sec_fkey),prim_fkey and (data,prim_fkey)]))
self.cached_sentences[data_name][str(ex_pkey_value)]=words
else:
print data_name,str(ex_pkey_value), "cache: hit"
return self.cached_sentences[data_name][str(ex_pkey_value)]
def att_to_s(self,att):
......@@ -71,5 +147,4 @@ class Wordification(object):
@param att Orange attribute
"""
return str(att).title().replace(' ','').replace('_','')
......@@ -63,8 +63,8 @@ class DBContext:
if col.endswith('_id'):
ref_table = (col[:-4] + 'ies') if col[-4] == 'y' else (col[:-3] + 's')
if ref_table in self.tables:
self.connected[(table, ref_table)] = (col, 'id')
self.connected[(ref_table, table)] = ('id', col)
self.connected[(table, ref_table)].append((col, 'id'))
self.connected[(ref_table, table)].append(('id', col))
self.fkeys[table].add(col)
if col == 'id':
self.pkeys[table] = col
......
......@@ -259,7 +259,7 @@ class Orange_Converter(Converter):
import orange
cols = self.db.cols[table_name]
attributes, metas, class_var = [], [], []
attributes, metas, class_var = [], [], None
for col in cols:
att_type = self.orng_type(table_name,col)
if att_type == 'd':
......@@ -272,13 +272,13 @@ class Orange_Converter(Converter):
if col == cls_att:
if att_type == 'string':
raise Exception('Unsuitable data type for a target variable: %s' % att_type)
class_var.append(att_var)
class_var=att_var
continue
elif att_type == 'string' or col in self.db.pkeys[table_name] or col in self.db.fkeys[table_name]:
elif att_type == 'string' or table_name in self.db.pkeys and col in self.db.pkeys[table_name] or table_name in self.db.fkeys and col in self.db.fkeys[table_name]:
metas.append(att_var)
else:
attributes.append(att_var)
domain = orange.Domain(attributes + class_var)
domain = orange.Domain(attributes, class_var)
for meta in metas:
domain.addmeta(orange.newmetaid(), meta)
dataset = orange.ExampleTable(domain)
......@@ -286,7 +286,7 @@ class Orange_Converter(Converter):
for row in self.db.rows(table_name, cols):
example = orange.Example(domain)
for col, val in zip(cols, row):
example[str(col)] = str(val)
example[str(col)] = str(val) if val!=None else '?'
dataset.append(example)
return dataset
......
This diff is collapsed.
......@@ -13,6 +13,10 @@ def mysql_db_context(request, input_dict, output_dict, widget):
initial_context = DBContext(con, find_connections=find_con)
initial_target_cols = initial_context.cols[initial_context.target_table]
cols_dump = json.dumps(initial_context.cols)
return render(request, 'interactions/db_context.html', {'widget':widget, 'context': initial_context, 'target_cols' : initial_target_cols, 'cols' : cols_dump})
return render(request, 'interactions/db_context.html', {'widget':widget,
'context': initial_context,
'connections' : dict(initial_context.connected),
'target_cols' : initial_target_cols,
'cols' : cols_dump})
......@@ -33,7 +33,7 @@
<tr><th>Table</th><th>Referenced table</th><th>Column</th><th>Referenced column</th></tr>
</thead>
<tbody>
{% for tables, cols in context.connected.items %}
{% for tables, cols in connections.items %}
{% for cols_inner in cols %}
<tr><td>{{tables.0}}</td><td>{{tables.1}}</td><td>{{cols_inner.0}}</td><td>{{cols_inner.1}}</td></tr>
{% endfor %}
......
This diff is collapsed.
def benchmark(input_dict):
    """Pass ``in_att`` through unchanged and report a timing value.

    Reads ``in_att`` and ``start_time`` from ``input_dict``; both default to
    ``None`` when absent.

    Returns a dict with two keys:
      * ``out_att``   -- the value of ``in_att``, untouched.
      * ``time_diff`` -- seconds elapsed since ``start_time`` when
        ``start_time`` is truthy; otherwise the current ``time.time()``
        reading itself.
    """
    from time import time as now

    started = input_dict.get('start_time')
    if started:
        elapsed = now() - started
    else:
        # No start time given: hand back the current clock reading instead
        # of a difference.
        elapsed = now()
    return {'out_att': input_dict.get('in_att'), 'time_diff': elapsed}
......@@ -123,7 +123,7 @@ def orng_table_to_dict(data):
metas.append(data.domain.get_meta(m).name)
for a in data.domain.attributes:
attrs.append(a.name)
pretty_float = lambda x, a: '%.3f' % x if a.var_type == Orange.feature.Type.Continuous else x
pretty_float = lambda x, a: '%.3f' % x if a.var_type == Orange.feature.Type.Continuous and x!='?' else x
for inst in xrange(len(data)):
inst_new = []
for a in data.domain.variables:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment