converters.py 20.9 KB
Newer Older
1
2
3
4
5
'''
Classes for handling DBContexts for ILP systems.

@author: Anze Vavpetic <anze.vavpetic@ijs.si>
'''
6
import re
7

8
class Converter:
9
    '''
10
    Base class for converters.
11
    '''
12
    def __init__(self, dbcontext):
13
14
15
16
        self.db = dbcontext
        self.connection = dbcontext.connection.connect()
        self.cursor = self.connection.cursor()

17
    def __del__(self):  
18
19
        self.connection.close()

20
21
22
23
24
25
26
27
class ILP_Converter(Converter):
    '''
    Base class for converting between a given database context (selected tables, columns, etc)
    to inputs acceptable by a specific ILP system.

    If possible, all subclasses should use lazy selects by forwarding the DB connection.
    '''
    def __init__(self, *args, **kwargs):
28
29
        self.settings = kwargs.pop('settings', {}) if kwargs else {}
        self.discr_intervals = kwargs.pop('discr_intervals', {}) if kwargs else {}
30
        self.dump = kwargs.pop('dump', True) if kwargs else True
31
32
        Converter.__init__(self, *args, **kwargs)

Anze Vavpetic's avatar
Anze Vavpetic committed
33
34
    def user_settings(self):
        return [':- set(%s,%s).' % (key,val) for key, val in self.settings.items()]
35
36

    def mode(self, predicate, args, recall=1, head=False):
Anze Vavpetic's avatar
Anze Vavpetic committed
37
38
        return ':- mode%s(%s, %s(%s)).' % ('h' if head else 'b', str(recall), predicate, ','.join([t+arg for t,arg in args]))

39
40
    def connecting_clause(self, table, ref_table):
        var_table, var_ref_table = table.capitalize(), ref_table.capitalize()
41
42
43
44
45
46
47
        result=[]
        for pk,fk in self.db.connected[(table, ref_table)]:
            ref_pk = self.db.pkeys[ref_table]
            table_args, ref_table_args = [], []
            for col in self.db.cols[table]:
                if col == pk:
                    col = var_table
48
                elif col in fk:
49
50
51
52
53
                    col = var_ref_table
                table_args.append(col.capitalize())
            for col in self.db.cols[ref_table]:
                if col == ref_pk:
                    col = var_ref_table
54
                if col in fk:
55
56
                    col = var_table
                ref_table_args.append(col.capitalize())
57
58
59
60
            result.extend(['%s_has_%s(%s, %s) :-' % (var_table.lower(),
                                                     ref_table, 
                                                     var_table.capitalize(), 
                                                     var_ref_table.capitalize()),
61
62
63
64
                            '\t%s(%s),' % (table, ','.join(table_args)),
                            '\t%s(%s).' % (ref_table, ','.join(ref_table_args))])
        return result

65
66

    def attribute_clause(self, table, att):
67
        var_table, var_att, pk = table.capitalize(), att.capitalize(), self.db.pkeys[table]
68
69
70
71
72
73
74
75
76
77
78
79
        intervals = []
        if self.discr_intervals.has_key(table):
            intervals = self.discr_intervals[table].get(att, [])
            if intervals:
                var_att = 'Discrete_%s' % var_att
        values_goal = '\t%s(%s)%s' % (table, ','.join([arg.capitalize() if arg!=pk else var_table for arg in self.db.cols[table]]), ',' if intervals else '.')
        discretize_goals = []
        n_intervals = len(intervals)
        for i, value in enumerate(intervals):
            punct = '.' if i == n_intervals-1 else ';'
            if i == 0:
                # Condition: att =< value_i
Matic Perovšek's avatar
Matic Perovšek committed
80
81
                label = '=< %.2f' % value
                condition = '%s =< %.2f' % (att.capitalize(), value)
82
83
84
85
                discretize_goals.append('\t((%s = \'%s\', %s)%s' % (var_att, label, condition, punct))
            if i < n_intervals-1:
                # Condition: att in (value_i, value_i+1]
                value_next = intervals[i+1]
Matic Perovšek's avatar
Matic Perovšek committed
86
87
                label = '(%.2f, %.2f]' % (value, value_next)
                condition = '%s > %.2f, %s =< %.2f' % (att.capitalize(), value, att.capitalize(), value_next)
88
89
90
                discretize_goals.append('\t(%s = \'%s\', %s)%s' % (var_att, label, condition, punct))
            else:
                # Condition: att > value_i
Matic Perovšek's avatar
Matic Perovšek committed
91
92
                label = '> %.2f' % value
                condition = '%s > %.2f' % (att.capitalize(), value)
93
94
95
                discretize_goals.append('\t(%s = \'%s\', %s))%s' % (var_att, label, condition, punct))
        return ['%s_%s(%s, %s) :-' % (table, att, var_table, var_att),  
                values_goal] + discretize_goals
96

97
98
99
100
    @staticmethod
    def numeric(val):
        for num_type in [int, float, long, complex]:
            try:
101
                num_type(str(val))
102
103
104
105
106
107
108
                return True
            except:
                pass
        return False

    def dump_tables(self):
        dump = []
109
        fmt_cols = lambda cols: ','.join([("%s" % col) if ILP_Converter.numeric(col) else ("'%s'" % str(col).replace("'", '"')) for col in cols])
110
111
        for table in self.db.tables:
            attributes = self.db.cols[table]
112
            dump.append(':- table %s/%d.' % (table, len(attributes)))
113
114
115
            dump.append('\n'.join(["%s(%s)." % (table, fmt_cols(cols)) for cols in self.db.rows(table, attributes)]))
        return dump

116
class RSD_Converter(ILP_Converter):
Anze Vavpetic's avatar
Anze Vavpetic committed
117
118
119
    '''
    Converts the database context to RSD inputs.
    '''
120
121
122
    def all_examples(self, target=None):
        if not target:
            target = self.db.target_table
123
124
        examples = self.db.rows(target, [self.db.target_att, self.db.pkeys[target]])
        return '\n'.join(["%s('%s', %s)." % (target, cls, pk) for cls, pk in examples])
125

Anze Vavpetic's avatar
Anze Vavpetic committed
126
127
128
129
130
    def background_knowledge(self):
        modeslist, getters = [self.mode(self.db.target_table, [('+', self.db.target_table)], head=True)], []
        for (table, ref_table) in self.db.connected.keys():
            if ref_table == self.db.target_table:
                continue # Skip backward connections
131
            modeslist.append(self.mode('%s_has_%s' % (table.lower(), ref_table), [('+', table), ('-', ref_table)]))
Anze Vavpetic's avatar
Anze Vavpetic committed
132
133
134
135
136
137
138
139
140
            getters.extend(self.connecting_clause(table, ref_table))
        for table, atts in self.db.cols.items():
            for att in atts:
                if att == self.db.target_att and table == self.db.target_table or \
                   att in self.db.fkeys[table] or att == self.db.pkeys[table]:
                    continue
                modeslist.append(self.mode('%s_%s' % (table, att), [('+', table), ('-', att)]))
                modeslist.append(self.mode('instantiate', [('+', att)]))
                getters.extend(self.attribute_clause(table, att))
141
142

        return '\n'.join(modeslist + getters + self.user_settings() + self.dump_tables())
143

144
class Aleph_Converter(ILP_Converter):
145
146
147
    '''
    Converts the database context to Aleph inputs.
    '''
Anze Vavpetic's avatar
Anze Vavpetic committed
148
    def __init__(self, *args, **kwargs):
149
        self.target_att_val = kwargs.pop('target_att_val')
150
        ILP_Converter.__init__(self, *args, **kwargs)
Anze Vavpetic's avatar
Anze Vavpetic committed
151
        self.__pos_examples, self.__neg_examples = None, None
152
        self.target_predicate = re.sub('\s+', '_', self.target_att_val).lower()
Anze Vavpetic's avatar
Anze Vavpetic committed
153

154
155
156
    def __target_predicate(self):
        return 'target_%s' % self.target_predicate

Anze Vavpetic's avatar
Anze Vavpetic committed
157
158
    def __examples(self):
        if not (self.__pos_examples and self.__neg_examples):
159
160
            target, att, target_val = self.db.target_table, self.db.target_att, self.target_att_val
            rows = self.db.rows(target, [att, self.db.pkeys[target]])
Anze Vavpetic's avatar
Anze Vavpetic committed
161
162
            pos_rows, neg_rows = [], []
            for row in rows:
163
                if str(row[0]) == target_val:
Anze Vavpetic's avatar
Anze Vavpetic committed
164
165
166
                    pos_rows.append(row)
                else:
                    neg_rows.append(row)
167
168
169
170

            if not pos_rows:
                raise Exception('No positive examples with the given target attribute value, please re-check.')

171
172
            self.__pos_examples = '\n'.join(['%s(%s).' % (self.__target_predicate(), id) for _, id in pos_rows])
            self.__neg_examples = '\n'.join(['%s(%s).' % (self.__target_predicate(), id) for _, id in neg_rows])
Anze Vavpetic's avatar
Anze Vavpetic committed
173
174
        return self.__pos_examples, self.__neg_examples

175
    def positive_examples(self):
Anze Vavpetic's avatar
Anze Vavpetic committed
176
177
        return self.__examples()[0]

178
    def negative_examples(self):
Anze Vavpetic's avatar
Anze Vavpetic committed
179
180
        return self.__examples()[1]

181
    def background_knowledge(self):
182
        modeslist, getters = [self.mode(self.__target_predicate(), [('+', self.db.target_table)], head=True)], []
Anze Vavpetic's avatar
Anze Vavpetic committed
183
184
185
186
        determinations, types = [], []
        for (table, ref_table) in self.db.connected.keys():
            if ref_table == self.db.target_table:
                continue # Skip backward connections
187
188
            modeslist.append(self.mode('%s_has_%s' % (table.lower(), ref_table), [('+', table), ('-', ref_table)], recall='*'))
            determinations.append(':- determination(%s/1, %s_has_%s/2).' % (self.__target_predicate(), table.lower(), ref_table))
Anze Vavpetic's avatar
Anze Vavpetic committed
189
190
191
192
193
194
195
196
            types.extend(self.concept_type_def(table))
            types.extend(self.concept_type_def(ref_table))
            getters.extend(self.connecting_clause(table, ref_table))
        for table, atts in self.db.cols.items():
            for att in atts:
                if att == self.db.target_att and table == self.db.target_table or \
                   att in self.db.fkeys[table] or att == self.db.pkeys[table]:
                    continue
197
                modeslist.append(self.mode('%s_%s' % (table, att), [('+', table), ('#', att.lower())], recall='*'))
198
                determinations.append(':- determination(%s/1, %s_%s/2).' % (self.__target_predicate(), table, att))
Anze Vavpetic's avatar
Anze Vavpetic committed
199
200
                types.extend(self.constant_type_def(table, att))
                getters.extend(self.attribute_clause(table, att))
201
        return '\n'.join(self.user_settings() + modeslist + determinations + types + getters + self.dump_tables())
Anze Vavpetic's avatar
Anze Vavpetic committed
202
203

    def concept_type_def(self, table):
204
205
206
207
        var_pk = self.db.pkeys[table].capitalize()
        variables = ','.join([var_pk if col.capitalize() == var_pk else '_' for col in self.db.cols[table]])
        return ['%s(%s) :-' % (table, var_pk), 
                '\t%s(%s).' % (table, variables)]
Anze Vavpetic's avatar
Anze Vavpetic committed
208
209

    def constant_type_def(self, table, att):
210
211
        var_att = att.capitalize()
        variables = ','.join([var_att if col == att else '_' for col in self.db.cols[table]])
212
        return ['%s(%s) :-' % (att.lower(), var_att), 
213
                '\t%s(%s).' % (table, variables)]
214

215
216
217

class Orange_Converter(Converter):
    '''
Matic Perovšek's avatar
Matic Perovšek committed
218
    Converts the selected tables in the given context to orange example tables.
219
220
    '''
    continuous_types = ('FLOAT','DOUBLE','DECIMAL','NEWDECIMAL')
221
222
    integer_types = ('TINY','SHORT','LONG','LONGLONG','INT24')
    ordinal_types = ('YEAR','VARCHAR','SET','VAR_STRING','STRING','BIT')
223
224
225
    
    def __init__(self, *args, **kwargs):
        Converter.__init__(self, *args, **kwargs)
Matic Perovšek's avatar
Matic Perovšek committed
226
227
228
        self.types={}
        for table in self.db.tables:
            self.types[table]= self.db.fetch_types(table, self.db.cols[table])
229
        self.db.compute_col_vals()
230

Matic Perovšek's avatar
Matic Perovšek committed
231
232
    def target_Orange_table(self):
        table, cls_att = self.db.target_table, self.db.target_att
233
234
235
236
        if not self.db.orng_tables:
            return self.convert_table(table, cls_att=cls_att)
        else:
            return self.db.orng_tables[table]
Matic Perovšek's avatar
Matic Perovšek committed
237
238
239

    def other_Orange_tables(self):
        target_table = self.db.target_table
240
241
242
243
        if not self.db.orng_tables:
            return [self.convert_table(table,None) for table in self.db.tables if table!=target_table]
        else:
            return [table for name, table in self.db.orng_tables.items() if name != target_table]
Matic Perovšek's avatar
Matic Perovšek committed
244

245
    def convert_table(self, table_name, cls_att=None):
246
247
248
249
        '''
        Returns the target table as an orange example table.
        '''
        import orange
Matic Perovšek's avatar
Matic Perovšek committed
250
251

        cols = self.db.cols[table_name]
252
        attributes, metas, class_var = [], [], None
253
        for col in cols:
Matic Perovšek's avatar
Matic Perovšek committed
254
            att_type = self.orng_type(table_name,col)
255
            if att_type == 'd':
Matic Perovšek's avatar
Matic Perovšek committed
256
                att_vals = self.db.col_vals[table_name][col]
257
                att_var = orange.EnumVariable(str(col), values=[str(val) for val in att_vals])
258
259
260
261
262
263
            elif att_type == 'c':
                att_var = orange.FloatVariable(str(col))
            else:
                att_var = orange.StringVariable(str(col))
            if col == cls_att:
                if att_type == 'string':
Matic Perovšek's avatar
Matic Perovšek committed
264
                    raise Exception('Unsuitable data type for a target variable: %s' % att_type)
265
                class_var=att_var
266
                continue
267
            elif att_type == 'string' or table_name in self.db.pkeys and col in self.db.pkeys[table_name] or table_name in self.db.fkeys and col in self.db.fkeys[table_name]:
268
269
270
                metas.append(att_var)
            else:
                attributes.append(att_var)
271
        domain = orange.Domain(attributes, class_var)
272
273
274
        for meta in metas:
            domain.addmeta(orange.newmetaid(), meta)
        dataset = orange.ExampleTable(domain)
Matic Perovšek's avatar
Matic Perovšek committed
275
276
        dataset.name=table_name
        for row in self.db.rows(table_name, cols):
277
278
            example = orange.Example(domain)
            for col, val in zip(cols, row):
279
                example[str(col)] = str(val) if val!=None else '?'
280
281
282
            dataset.append(example)
        return dataset

283
    def orng_type(self, table_name, col):
284
285
286
        '''
        Assigns a given mysql column an orange type.
        '''
Matic Perovšek's avatar
Matic Perovšek committed
287
288
        mysql_type = self.types[table_name][col]
        n_vals = len(self.db.col_vals[table_name][col])
289
        if mysql_type in Orange_Converter.continuous_types or (n_vals >= 50 and mysql_type in Orange_Converter.integer_types):
290
            return 'c'
Matic Perovšek's avatar
Matic Perovšek committed
291
        elif mysql_type in Orange_Converter.ordinal_types+Orange_Converter.integer_types:
292
293
294
295
            return 'd'
        else:
            return 'string'

296
297
298
299
300
301
302

class TreeLikerConverter(Converter):
    '''
    Converts a db context to the TreeLiker dataset format.
    '''
    def __init__(self, *args, **kwargs):
        self.discr_intervals = kwargs.pop('discr_intervals', {}) if kwargs else {}
Anze Vavpetic's avatar
Anze Vavpetic committed
303
304
        self._template = []
        self._predicates = set()
305
        self._output_types = set()
306
307
        Converter.__init__(self, *args, **kwargs)

Anze Vavpetic's avatar
Anze Vavpetic committed
308

309
310
311
312
313
314
315
316
    def _row_pk(self, target, cols, row):
        row_pk = None
        for idx, col in enumerate(row):
            if cols[idx] == self.db.pkeys[target]:
                row_pk = col
                break
        return row_pk

Anze Vavpetic's avatar
Anze Vavpetic committed
317

318
    def _facts(self, pk, pk_att, target, visited=set(), parent_table='', parent_pk=''):
319
        '''
320
        Returns the facts for the given entity with pk in `target`.
321
322
        '''
        facts = []
323
        cols = self.db.cols[target]
324
        if target != self.db.target_table:
325
326
327

            # Skip the class attribute
            if self.db.target_att in cols:
328
                cols.remove(self.db.target_att)
329
330

            # All rows matching `pk`
331
            for row in self.db.select_where(target, cols, pk_att, pk):
332
333
                row_pk = self._row_pk(target, cols, row)
                row_pk_name = '%s%s' % (target, str(row_pk))
334
                parent_pk_name = '%s%s' % (parent_table, str(parent_pk))
335
336

                # Each attr-value becomes one fact
337
338
                for idx, col in enumerate(row):
                    attr_name = cols[idx]
339
340

                    # We give pks/fks a symbolic name based on the table and id
341
                    if attr_name in self.db.fkeys[target]:
342
343
344
345
346
                        origin_table = self.db.reverse_fkeys[(target, attr_name)]
                        if origin_table != self.db.target_table:
                            col = '%s%s' % (origin_table, str(col))
                        else:
                            continue
347
                    elif attr_name == self.db.pkeys[target]:
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
                        if parent_table and parent_table != self.db.target_table:
                            predicate = '%s_has_%s' % (parent_table, target)
                            predicate_template = '%s(+%s, -%s)' % (predicate,
                                                                   parent_table,
                                                                   target)
                            facts.append('%s(%s, %s)' % (predicate, 
                                                         parent_pk_name, 
                                                         row_pk_name))
                        else:
                            predicate = 'has_%s' % (target)
                            predicate_template = '%s(-%s)' % (predicate,
                                                              target)
                            facts.append('%s(%s)' % (predicate, row_pk_name))

                        output_type = '-%s' % target
                        if predicate_template not in self._predicates and \
                           output_type not in self._output_types:
                            self._output_types.add('-%s' % target)
                            self._predicates.add(predicate_template)
                            self._template.append(predicate_template)
368

369
370
                    # Constants
                    else:
Anze Vavpetic's avatar
Anze Vavpetic committed
371
                        predicate = 'has_%s' % attr_name
372

373
                        col = self._discretize_check(target, attr_name, col)
Anze Vavpetic's avatar
Anze Vavpetic committed
374
375
376
                        facts.append('%s(%s, %s)' % (predicate, 
                                                     row_pk_name,
                                                     str(col)))
377
378
379
380
381
382
383
                        predicate_template = '%s(+%s, #%s)' % (predicate,
                                                               target,
                                                               attr_name)
                    
                        if predicate_template not in self._predicates:
                            self._predicates.add(predicate_template)
                            self._template.append(predicate_template)
384
385

        # Recursively follow links to other tables
386
387
388
        for table in self.db.tables:
            if (target, table) not in self.db.connected:
                continue
389

390
            for this_att, that_att in self.db.connected[(target, table)]:
391
392
                if (target, table, this_att, that_att) not in visited:
                    visited.add((target, table, this_att, that_att))
393
                    
394
                    # Link case 1: this_att = pk_att is a fk in another table
395
                    if this_att == pk_att:
396
                        facts.extend(self._facts(pk,
397
398
                                                 that_att,
                                                 table, 
399
400
401
                                                 visited=visited,
                                                 parent_table=target,
                                                 parent_pk=pk))
402
403
                    
                    # Link case 2: this_att is a fk of another table
404
                    else:
405
                        fk_list = []
406
                        for row in self.db.select_where(target, [this_att]+cols, pk_att, pk):
407
408
409
                            row_pk = self._row_pk(target, cols, row[1:])
                            fk_list.append((row[0], row_pk))
                        for fk, row_pk in fk_list:
410
                            facts.extend(self._facts(fk,
411
412
                                                     that_att,
                                                     table, 
413
414
415
                                                     visited=visited,
                                                     parent_table=target,
                                                     parent_pk=row_pk))
416
417
        return facts

Anze Vavpetic's avatar
Anze Vavpetic committed
418

419
420
421
422
    def _discretize_check(self, table, att, col):
        '''
        Replaces the value with an appropriate interval symbol, if available.
        '''
423
        label = "'%s'" % col
424
425
426
427
428
429
430
431
432
433
434
        if table in self.discr_intervals and att in self.discr_intervals[table]:
            intervals = self.discr_intervals[table][att]
            n_intervals = len(intervals)

            prev_value = None
            for i, value in enumerate(intervals):

                if i > 0:
                    prev_value = intervals[i-1]

                if not prev_value and col <= value:
435
                    label = "'=<%.2f'" % value
436
437
                    break
                elif prev_value and col <= value:
438
                    label = "'(%.2f;%.2f]'" % (prev_value, value)
439
440
                    break
                elif col > value and i == n_intervals - 1:
441
                    label = "'>%.2f'" % value
442
                    break
443
444
445
446
        else:
            # For some reason using [ and ] crashes TreeLiker
            label = label.replace('[', 'I')
            label = label.replace(']', 'I')
447
448
449
450

        return label


451
    def dataset(self):
452
453
454
455
        '''
        Returns the db context as a list of interpretations, i.e., a list of 
        facts true for each example.
        '''
456
457
        target = self.db.target_table
        db_examples = self.db.rows(target, [self.db.target_att, self.db.pkeys[target]])
458

459
460
        examples = []
        for cls, pk in sorted(db_examples, key=lambda ex: ex[0]):
461
            facts = self._facts(pk, self.db.pkeys[target], target, visited=set())
462
463
464
465
            examples.append('%s %s' % (cls, ', '.join(facts)))

        return '\n'.join(examples)

Anze Vavpetic's avatar
Anze Vavpetic committed
466

467
    def default_template(self):
Anze Vavpetic's avatar
Anze Vavpetic committed
468
        return '[%s]' % (', '.join(self._template))
469

470

471
472
if __name__ == '__main__':
    from context import DBConnection, DBContext
473
474
475
    context = DBContext(DBConnection('ilp','ilp123','ged.ijs.si','muta_42'))
    context.target_table = 'drugs'
    context.target_att = 'active'
476
    conv = Aleph_Converter(context)