mapper.py 4.89 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
# Mapping unseen relational examples to an existing propositionalized domain
import tempfile
import subprocess
import os
import re
import arff

from converters import RSD_Converter, TreeLikerConverter


def _feature_numbers(features):
    n = len(features.splitlines())
    featureIDs = map(lambda id: str(id), range(1, n+1))
    return 'featureIDs([%s]).' % (','.join(featureIDs))


example_id_pattern = re.compile(r', (?P<id>.+)\)\.')
def _example_ids(pred, examples):
    exampleIDs = example_id_pattern.findall(examples, re.M)
    return '%s([%s]).' % (pred, ','.join(exampleIDs))


def domain_map(features, feature_format, train_context, test_context,
               intervals=None,
               format='arff',
               positive_class=None):
    """Map unseen relational (test) examples onto an existing
    propositionalized feature space and return the resulting dataset.

    Parameters
    ----------
    features :
        For 'rsd'/'aleph' formats: a newline-separated string of feature
        clauses. For 'treeliker': the TreeLiker object itself (it is run
        against the test dataset).
    feature_format : str
        One of 'rsd', 'aleph' or 'treeliker'.
    train_context, test_context :
        ILP context objects consumed by the converters.
    intervals : dict, optional
        Discretization intervals forwarded to the test-side converter.
        (The original mutable default ``{}`` is replaced by ``None`` to
        avoid the shared-mutable-default pitfall; behavior is unchanged.)
    format : str
        Output serialization, 'arff' or 'csv'.
    positive_class : str, optional
        For aleph features: the class value to be mapped to 'positive'.

    Returns
    -------
    The serialized dataset, the string 'unsupported format', or None for
    an unknown `feature_format`.
    """
    if intervals is None:
        intervals = {}

    dataset = None
    if feature_format in ['rsd', 'aleph']:
        # NOTE(review): only the test converter receives discr_intervals;
        # presumably the training data was discretized earlier -- confirm.
        train_rsd = RSD_Converter(train_context)
        test_rsd = RSD_Converter(test_context, discr_intervals=intervals)

        mapper_target_name = train_context.target_table + '_mapper'

        train_examples = train_rsd.all_examples(pred_name=mapper_target_name)
        test_examples = test_rsd.all_examples(pred_name=mapper_target_name)

        if feature_format == 'aleph':
            features = aleph_to_rsd_features(features)

        # Assemble the Prolog background knowledge consumed by mapper.pl.
        prolog_bk = '\n'.join([
            _example_ids('testExampleIDs', test_examples),
            '%% test examples',
            test_examples,
            '%% train examples',
            train_examples,
            '%% train background knowledge',
            train_rsd.background_knowledge(),
            '%% test background knowledge',
            test_rsd.background_knowledge(),
            _feature_numbers(features),
            '%% features',
            features,
        ])
        THIS_DIR = os.path.dirname(__file__) if os.path.dirname(__file__) else '.'
        f = tempfile.NamedTemporaryFile(delete=False)
        f.write(prolog_bk)
        f.close()

        # Evaluate the features over the test examples with YAP Prolog;
        # the temp file must outlive the subprocess, so clean it up only
        # afterwards (the original leaked it).
        cmd_args = ['yap', '-L', '--', '%s/mapper.pl' % THIS_DIR, f.name,
                    mapper_target_name]
        try:
            evaluations = subprocess.check_output(cmd_args)
        finally:
            os.unlink(f.name)

        dataset = dump_dataset(features, feature_format, evaluations,
                               train_context,
                               format=format,
                               positive_class=positive_class)

    elif feature_format == 'treeliker':
        # We provide treeliker with the test dataset
        # since it has a built-in ability to evaluate features
        treeliker_test = TreeLikerConverter(test_context,
                                            discr_intervals=intervals)
        treeliker = features
        treeliker.test_dataset = treeliker_test.dataset()
        _, test_dataset = treeliker.run()

        if format == 'arff':
            dataset = test_dataset
        else:
            return 'unsupported format'

    return dataset


def dump_dataset(features, feature_format, evaluations, train_context,
                 format='arff',
                 positive_class=None):
    """Serialize feature `evaluations` into an ARFF or CSV dataset string.

    Parameters
    ----------
    features : str
        Newline-separated feature clauses; only the line count is used
        (one '+'/'-' attribute per feature).
    feature_format : str
        'aleph' triggers positive/negative class re-mapping; anything
        else reads the class values from the preloaded target table.
    evaluations : str
        Whitespace-separated feature values, one example per line, with
        the class value last.
    train_context :
        Provides `target_table` and the preloaded `orng_tables`.
    format : str
        'arff' or 'csv'.
    positive_class : str, optional
        The raw class value to map to 'positive' for aleph features.

    Returns
    -------
    str -- the serialized dataset, or 'unsupported format'.

    Raises
    ------
    Exception -- when the target table is not preloaded in memory
    (arff output only).
    """
    if format == 'arff':
        data = {
            'attributes': [],
            'data': [],
            'description': '',
            'relation': 'default'
        }
        # One binary ('+'/'-') attribute per feature line.
        n_features = len(features.splitlines())
        for i in range(1, n_features + 1):
            data['attributes'].append(('f%d' % i, ['+', '-']))

        target = train_context.target_table
        if target not in train_context.orng_tables:
            raise Exception('Target table is not preloaded in memory! Please select the `dump data` parameter in the converter widget.')
        if feature_format == 'aleph':
            # Aleph always yields a fixed binary class.
            target_vals = ('negative', 'positive')
        else:
            orng_target = train_context.orng_tables[target]
            target_vals = tuple(sorted(orng_target.domain.classVar.values))

        data['attributes'].append(('class', target_vals))
        for line in evaluations.splitlines():
            values = line.strip().split()
            if feature_format == 'aleph':
                # Re-map the raw class value onto positive/negative.
                values[-1] = ('positive' if values[-1] == positive_class
                              else 'negative')
            data['data'].append(values)
        return arff.dumps(data)

    elif format == 'csv':
        # Comma-join the whitespace-separated values, one row per line
        # (a join instead of the original quadratic `+=` accumulation).
        return ''.join(','.join(line.strip().split()) + '\n'
                       for line in evaluations.splitlines())

    return 'unsupported format'


def aleph_to_rsd_features(features):
    """Rewrite aleph-style feature clauses as RSD-style ``f/2`` clauses.

    Lines that do not begin with 'feature' are skipped; surviving
    features are renumbered sequentially from 1, regardless of the ID
    embedded in the original clause.
    """
    converted = []
    for raw_line in features.splitlines():
        if not raw_line.startswith('feature'):
            continue
        # Keep everything from ':-' up to the clause-closing ')).',
        # then terminate the new clause with a single '.'.
        clause_body = raw_line[raw_line.find(':-'):raw_line.find(')).')]
        converted.append('f(%d, A)%s.' % (len(converted) + 1, clause_body))
    return '\n'.join(converted)