Commit e109589f authored by Matic Perovšek

Wordification: compute proper IDF values and allow precomputed IDFs to be passed in and reused

parent d352f82a
......@@ -93,9 +93,10 @@ def ilp_wordification(input_dict):
weighting_measure = input_dict.get('weighting_measure', 'tfidf')
context = input_dict.get('context', None)
word_att_length = int(input_dict.get('f_ngram_size', 1))
idf=input_dict.get('idf', None)
for _ in range(1):
wordification = Wordification(target_table,other_tables,context,word_att_length)
wordification = Wordification(target_table,other_tables,context,word_att_length,idf)
wordification.run(1)
wordification.calculate_tf_idfs(weighting_measure)
#wordification.prune(50)
......@@ -105,7 +106,7 @@ def ilp_wordification(input_dict):
from wordification import Wordification_features_test
wft=Wordification_features_test(target_table,other_tables,context)
wft.print_results()
return {'arff' : wordification.to_arff(),'corpus': wordification.wordify()}
return {'arff' : wordification.to_arff(),'corpus': wordification.wordify(),'idf':wordification.idf}
def ilp_treeliker(input_dict):
......
......@@ -24,6 +24,23 @@
"name": "Wordification"
}
},
{
"model": "workflows.abstractinput",
"fields": {
"widget": "63f936a1-6841-44c5-8340-8e4f3fef4f6a",
"name": "Inverse Document Frequencies",
"short_name": "idf",
"default": "",
"description": "Inverse Document Frequencies which will be used for feature calculation.",
"required": false,
"multi": false,
"parameter_type": null,
"variable": "idf",
"parameter": false,
"order": 6,
"uid": "3a9a9e51-9ad2-4d6a-880a-8145b824a32b"
}
},
{
"model": "workflows.abstractinput",
"fields": {
......@@ -37,7 +54,7 @@
"parameter_type": null,
"variable": "other_tables",
"parameter": false,
"order": 3,
"order": 4,
"uid": "72854a26-f7bf-46a3-988e-61caaeabdc7a"
}
},
......@@ -54,7 +71,7 @@
"parameter_type": null,
"variable": "context",
"parameter": false,
"order": 4,
"order": 5,
"uid": "8f02978b-e0d7-45c5-ab74-6a0e8728654d"
}
},
......@@ -71,7 +88,7 @@
"parameter_type": "text",
"variable": "f_ngram_size",
"parameter": true,
"order": 2,
"order": 3,
"uid": "b5eea832-1712-4db1-a677-92deca4e7a75"
}
},
......@@ -88,7 +105,7 @@
"parameter_type": "select",
"variable": "weighting_measure",
"parameter": true,
"order": 1,
"order": 2,
"uid": "cc2a208c-bf6f-4f64-b51e-221c80366f8e"
}
},
......@@ -109,6 +126,18 @@
"uid": "f1b77c89-ed7b-4181-9f62-c2001fd7c388"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
"widget": "63f936a1-6841-44c5-8340-8e4f3fef4f6a",
"name": "Inverse Document Frequencies",
"short_name": "idf",
"description": "Calculated Inverse Document Frequencies",
"variable": "idf",
"order": 3,
"uid": "3cea08cf-98d5-4be2-8f0a-68ec8cd13b90"
}
},
{
"model": "workflows.abstractoutput",
"fields": {
......
from collections import defaultdict
from math import log
import string,itertools
import multiprocessing
......@@ -89,7 +90,7 @@ def att_to_s(att):
class Wordification(object):
def __init__(self,target_table,other_tables,context,word_att_length):
def __init__(self,target_table,other_tables,context,word_att_length,idf=None):
"""
Wordification object constructor.
......@@ -100,6 +101,8 @@ class Wordification(object):
self.other_tables=other_tables
self.context=context
self.word_att_length=word_att_length
self.idf=idf
#self.minimum_word_frequency=minimum_word_frequency
self.connecting_tables=defaultdict(list)
......@@ -178,17 +181,11 @@ class Wordification(object):
"""
from math import log
#TODO replace with spipy matrices (and calculate with scikit)
print "compute tf-idf"
words = set()
for document in self.resulting_documents:
for word in document:
words.add(word)
for document in self.resulting_documents:
for word in set(document):
self.word_in_how_many_documents[word]+=1
no_of_documents=len(self.resulting_documents)
if measure=='tfidf':
self.calculate_idf()
for doc_idx, document in enumerate(self.resulting_documents):
#print str(doc_idx)
......@@ -199,9 +196,27 @@ class Wordification(object):
for word in document:
tf=train_word_count[word]
idf = 1 if measure=="tf" else log(no_of_documents / float(self.word_in_how_many_documents[word]))
idf = 1 if measure=="tf" else (self.idf[word] if word in self.idf else None)
if word=='Cars_Position_3':
idf+=100
if idf!=None:
self.tf_idfs[doc_idx][word] = tf * idf
def calculate_idf(self):
    """
    Return the inverse document frequency table, computing it if needed.

    If ``self.idf`` already holds a non-empty table, it is returned
    unchanged. Otherwise the number of documents containing each word is
    counted over ``self.resulting_documents`` and ``self.idf`` is set to a
    dict mapping every word to ``log(no_of_documents / document_count)``.

    :return: the idf dict stored in ``self.idf``.
    :raises Exception: if ``self.word_in_how_many_documents`` is already
        populated while no idf table is available, since counting again
        would add to the existing document counts.
    """
    # Truthiness (not ``is None``) is deliberate: any falsy ``self.idf``
    # (None, an empty dict, an empty string) is treated as "not supplied".
    if self.idf:
        return self.idf

    if self.word_in_how_many_documents:
        raise Exception('Words in document occurence already calculated!')

    # Count each word once per document, hence the set().
    # NOTE(review): ``+=1`` on unseen words presumably relies on
    # word_in_how_many_documents being a defaultdict(int) -- confirm in the
    # constructor.
    for document in self.resulting_documents:
        for word in set(document):
            self.word_in_how_many_documents[word] += 1

    no_of_documents = len(self.resulting_documents)

    # float() keeps the division exact under Python 2 integer semantics.
    self.idf = {}
    for word, count in self.word_in_how_many_documents.items():
        self.idf[word] = log(no_of_documents / float(count))

    # Return the table on this path too, so both successful paths agree.
    return self.idf
self.tf_idfs[doc_idx][word] = tf * idf
def to_arff(self):
print "begin to_arff"
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment