# csv2arff.py (committed by Izidorf)
# -*- coding: cp1252 -*-
import csv
import sys
from xml.dom import minidom






def get_attributes(file_xml):
    """Return the <attribute> elements of an XML description as dicts.

    file_xml is handed straight to minidom.parse().  The result is a list,
    in document order, with one dict per <attribute> element holding the
    values of its 'name', 'atype', 'format' and 'skip' XML attributes.
    """
    wanted_keys = ('name', 'atype', 'format', 'skip')
    document = minidom.parse(file_xml)
    return [
        dict((key, element.getAttribute(key)) for key in wanted_keys)
        for element in document.getElementsByTagName('attribute')
    ]

def get_relation(file_xml):
    """Return (relation_name, delimiter) read from the <csv> element.

    file_xml is handed straight to minidom.parse().  The relation name is
    the element's 'name' attribute and the delimiter its 'delimiter'
    attribute; the delimiter falls back to ';' when it is missing or empty.
    When the document holds no <csv> element the result is ('', ';').
    """
    document = minidom.parse(file_xml)
    relation_name = ''
    delimiter = ''
    # If several <csv> elements are present, the last one wins.
    for node in document.getElementsByTagName('csv'):
        relation_name = node.getAttribute('name')
        delimiter = node.getAttribute('delimiter')
    if not delimiter:
        delimiter = ';'
    return relation_name, delimiter


class csv_arff_converter:
    """Convert a delimited text file into an ARFF file.

    The column layout is described by an XML attribute file that is read
    through get_relation() and get_attributes().
    """

    def __init__(self, csv_file, attribute_file, file_out):
        """Remember the paths; no file is touched until run() is called.

        csv_file       -- path of the delimited input file
        attribute_file -- path of the XML file describing the columns
        file_out       -- path of the ARFF file to write
        """
        self.csv_file = csv_file
        self.attribute_file = attribute_file
        self.file_out = file_out

    def run(self):
        """Read the inputs and write the ARFF text to self.file_out.

        Sets self.relation_name and self.delimiter as a side effect.
        """
        # Always read the description from the path given to the constructor.
        self.relation_name, self.delimiter = get_relation(self.attribute_file)
        attributes_list = get_attributes(self.attribute_file)

        with open(self.csv_file) as csv_handle:
            reader = csv.reader(csv_handle, delimiter=self.delimiter,
                                quoting=csv.QUOTE_NONE)
            arff_data = self._build_arff(self.relation_name,
                                         attributes_list, reader)

        with open(self.file_out, 'w') as out_handle:
            out_handle.write(arff_data)

    @staticmethod
    def _build_arff(relation_name, attributes_list, rows):
        """Return the ARFF text for rows laid out as attributes_list says.

        relation_name   -- text placed after @RELATION
        attributes_list -- dicts with 'name', 'atype', 'format' and 'skip'
                           keys, one per input column, in column order
        rows            -- iterable of lists of strings (e.g. a csv.reader)

        Columns whose 'skip' is 'yes' are left out of both the header and
        the data.  'string' fields are wrapped in single quotes.  For
        'class' columns the distinct values found in the data are listed
        in the header, in order of first appearance.

        Raises IndexError when a row holds more used fields than there are
        entries in attributes_list.
        """
        # Distinct values of every kept 'class' column, keyed by the
        # column's index in attributes_list so that skipped columns cannot
        # shift the bookkeeping.
        class_values = {}
        class_seen = {}
        for index, attribute in enumerate(attributes_list):
            if attribute['skip'] != 'yes' and attribute['atype'] == 'class':
                class_values[index] = []
                class_seen[index] = set()

        data_lines = []
        for row in rows:
            kept = []
            # Watch out for the row length: the last field of every row is
            # ignored, exactly as before.
            # NOTE(review): presumably the input rows end with a trailing
            # delimiter -- confirm against the real data files.
            for pos, field in enumerate(row[:-1]):
                attribute = attributes_list[pos]
                if attribute['skip'] == 'yes':
                    continue
                if attribute['atype'] == 'string':
                    field = "'" + field + "'"
                elif attribute['atype'] == 'class':
                    # For a class column, collect the values it takes.
                    if field not in class_seen[pos]:
                        class_seen[pos].add(field)
                        class_values[pos].append(field)
                kept.append(field)
            # Joining the kept fields avoids a leading comma when the
            # first column is skipped.
            data_lines.append(','.join(kept) + '\n')

        header = ['@RELATION ' + relation_name + '\n\n']
        for index, attribute in enumerate(attributes_list):
            if attribute['skip'] == 'yes':
                continue
            line = '@ATTRIBUTE ' + attribute['name'] + ' ' + attribute['atype']
            if attribute['atype'] == 'date':
                line += ' ' + attribute['format']
            elif attribute['atype'] == 'class':
                # NOTE(review): the 'class (a,b)' form is kept unchanged;
                # the ARFF format usually declares nominal values as
                # '{a,b}' -- confirm what the consumer of this file expects.
                line += ' (' + ','.join(class_values[index]) + ')'
            header.append(line + '\n')
        header.append('\n@DATA\n')

        return ''.join(header) + ''.join(data_lines)
        
                
                



if __name__ == "__main__":
    # Usage: csv2arff.py [CSV_FILE ATTRIBUTE_FILE [OUTPUT_FILE]]
    # When the input paths are not given on the command line, the
    # hard-coded test dataset paths below are used instead.
    if len(sys.argv) >= 3:
        csv_file = sys.argv[1]
        attribute_file = sys.argv[2]
    else:
        csv_file = './test_csv2arff/test_dataset_1.csv'
        attribute_file = './test_csv2arff/test_dataset_1.att'

    if len(sys.argv) >= 4:
        file_out = sys.argv[3]
    else:
        file_out = './test_csv2arff/output.arff'

    instance = csv_arff_converter(csv_file, attribute_file, file_out)
    instance.run()