Commit f333e1f3 authored by Matic Perovšek

xml to text, select subsections

wordification tests, additional measures
mesh filter
open in crossbee widget
parent 4bed6381
@@ -140,7 +140,7 @@ class NCBI_Extractor(object):
         return doc
     #end
-    def extractArticleText(self, root, did):
+    def extractArticleText(self, root, did, only_sections=None):
         try:
             titleNode = root.getElementsByTagName('article-title')[0]
         except Exception:
@@ -164,7 +164,7 @@ class NCBI_Extractor(object):
             body = ''
             print 'Warning: no body found, document %s' % str(did)
         else:
-            body = self.list2text(self.recursiveCollect(bodyNode, []))
+            body = self.list2text(self.recursiveCollect(bodyNode, [], only_sections))
         body = re.sub('(\[)[ ,-:;]*(\])', '', body)
         ytags = root.getElementsByTagName('pub-date')
@@ -185,12 +185,14 @@ class NCBI_Extractor(object):
     #end
     #
-    def recursiveCollect(self, node, result, skipTags=['title', 'xref', 'table', 'graphic', 'ext-link',
+    def recursiveCollect(self, node, result, only_sections=None, skipTags=['title', 'xref', 'table', 'graphic', 'ext-link',
                                                        'media', 'inline-formula', 'disp-formula']):
         for child in node.childNodes:
             if child.nodeType == dom.Node.ELEMENT_NODE:
                 if child.tagName not in skipTags:
-                    self.recursiveCollect(child, result)
+                    print child.getAttribute("sec-type"), child.tagName
+                    if not only_sections or child.tagName != 'sec' or not child.hasAttribute('sec-type') or child.getAttribute('sec-type') in only_sections:
+                        self.recursiveCollect(child, result, only_sections)
             elif child.nodeType == dom.Node.TEXT_NODE:
                 result.append(child.data)
         #endfor
@@ -198,6 +200,21 @@ class NCBI_Extractor(object):
         return result
     #end
+    def recursiveCollectET(self, node, result, only_sections=None, skipTags=['title', 'xref', 'table', 'graphic', 'ext-link',
+                                                                             'media', 'inline-formula', 'disp-formula']):
+        if node.tag not in skipTags:
+            #print node.attrib["sec-type"], node.tag
+            if not only_sections or node.tag != 'sec' or 'sec-type' not in node.attrib or node.attrib['sec-type'] in only_sections:
+                if node.text and node.text.strip() != "":
+                    result.append(node.text.replace('\n', ''))
+                if node.tail and node.tail.strip() != "":
+                    result.append(node.tail.replace('\n', ''))
+                for child in list(node):
+                    self.recursiveCollectET(child, result, only_sections)
+        return result
     def list2text(self, lst):
         result = ''
         for x in lst:
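The only_sections filter threaded through both collectors is easiest to see from the caller's side. A minimal usage sketch, assuming getXML returns the raw article XML string for a document id (as its use in bio3graph_get_xmls later in this commit suggests); the id and section names are illustrative only:

    import xml.dom.minidom as dom
    from NCBI import NCBI_Extractor

    extractor = NCBI_Extractor()
    did = '1062151'                                # illustrative document id
    root = dom.parseString(extractor.getXML(did))  # getXML assumed per bio3graph_get_xmls
    # Keep only body text from <sec sec-type="results"> / <sec sec-type="discussion">;
    # sections without a sec-type attribute still pass the filter, as in recursiveCollect.
    doc = extractor.extractArticleText(root, did, only_sections=['results', 'discussion'])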
 from django.shortcuts import render
 def bio3graph_filter_integers(request, input_dict, output_dict, widget):
-    return render(request, 'interactions/bio3graph_filter_integers.html', {'widget': widget, 'intList': input_dict['intList']})
\ No newline at end of file
+    return render(request, 'interactions/bio3graph_filter_integers.html', {'widget': widget, 'intList': input_dict['intList']})
+
+def bio3graph_xml_to_fulltext(request, input_dict, output_dict, widget):
+    from NCBI import NCBI_Extractor
+    #xmls = input_dict['xml_list']
+    # if not isinstance(xmls, list):
+    import xml.etree.ElementTree as ET
+    # tree = ET.fromstring(xmls)
+    # #root = tree.getroot()
+    #
+    # xml_elements=tree.findall('article')
+    # xmls=[ET.tostring(el) for el in xml_elements]
+    #
+    file_name = input_dict['xml_file']
+    import time
+    timea = time.time()
+    print file_name
+    num_of_all_articles = 0
+    with open(file_name, 'r') as f:
+        for line in f:
+            if "</article>" in line:
+                num_of_all_articles += 1
+    print time.time() - timea
+    print num_of_all_articles
+    article_count = 0
+    sections = []
+    #
+    import xml.dom.minidom as dom
+    # for xml in xmls:
+    #     root = dom.parseString(xml)
+    #     sections |= set([sec.getAttribute("sec-type") for sec in root.getElementsByTagName('sec') if sec.hasAttribute('sec-type')])
+
+    def get_title(elem):
+        txt = ''
+        if elem.text:
+            txt += elem.text.strip()
+        for child in list(elem):  #only one level
+            if child.text:
+                txt += child.text.strip()
+            if child.tail:
+                txt += child.tail.strip()
+        if elem.tail:
+            txt += elem.tail.strip()
+        return txt.lower()
+
+    from xml.etree.ElementTree import XMLParser
+    parser = XMLParser(encoding="utf-8")
+    #optionstree = ET.parse("test.conf", parser=parser)
+    if True:
+        path = []
+        with open(file_name, 'r') as f:
+            ignore_title = False
+            #with open("D:/diagonalization/glio_aml/domain1/1062151.xml") as f:
+            for event, elem in ET.iterparse(f, events=('start', 'end')):
+                if event == "start":
+                    if elem.tag == "sec" and not "sec" in path:
+                        if 'sec-type' in elem.attrib:
+                            sections.append(elem.attrib["sec-type"].lower())
+                            ignore_title = True
+                    #titles of others
+                    elif elem.tag == "title" and not ignore_title and path.count("sec") == 1:
+                        sections.append(get_title(elem))
+                    path.append(elem.tag)
+                elif event == "end":
+                    if elem.tag == "article":
+                        article_count += 1
+                        print article_count, "/", num_of_all_articles
+                    path.pop()
+                    if elem.tag == "sec" and not "sec" in path:
+                        ignore_title = False
+                    elem.clear()
+
+    from collections import Counter
+    #print len(sections), sections
+    #print Counter(sections)
+    section_names = ['article title', 'abstract', 'figure captions', 'table captions']
+    if article_count > 0:
+        section_names += [a[0] + "::" + str(a[1]) for a in Counter(sections).most_common(None)]
+    return render(request, 'visualizations/xml_to_fulltext.html',
+                  {'widget': widget, 'section_names': section_names, 'num_of_all_articles': num_of_all_articles})
+
+def mesh_filter(request, input_dict, output_dict, widget):
+    import json
+    from os.path import normpath, join, dirname
+    categories = json.load(open(normpath(join(dirname(__file__), 'data/toplevels.json'))))  #{'a':['a1','a2','a3'],'b':['b1','b2']}
+    return render(request, 'visualizations/mesh_filter.html', {'categories': categories, 'widget': widget})
\ No newline at end of file
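Both this view and the *_finished callback later in the commit lean on ET.iterparse to stream large XML dumps instead of parsing them into one in-memory tree; calling elem.clear() at each article's end event is what keeps memory bounded. A self-contained sketch of that pattern (toy markup, assumed for illustration):

    import xml.etree.ElementTree as ET
    from io import BytesIO

    toy = BytesIO(b'<root>'
                  b'<article><sec sec-type="methods"><p>m</p></sec></article>'
                  b'<article><sec sec-type="results"><p>r</p></sec></article>'
                  b'</root>')

    sec_types, article_count = [], 0
    for event, elem in ET.iterparse(toy, events=('start', 'end')):
        if event == 'start' and elem.tag == 'sec' and 'sec-type' in elem.attrib:
            sec_types.append(elem.attrib['sec-type'].lower())
        elif event == 'end' and elem.tag == 'article':
            article_count += 1
            elem.clear()  # free the subtree we have already consumed
    print sec_types, article_count  # ['methods', 'results'] 2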
@@ -233,7 +233,6 @@ def bio3graph_get_xmls(input_dict):
         result.append(a.getXML(did))
     return {'xmls': result}
 def bio3graph_get_fulltexts(input_dict):
     from NCBI import NCBI_Extractor
@@ -250,6 +249,112 @@ def bio3graph_get_fulltexts(input_dict):
     return {'fulltexts': result}
+def bio3graph_xml_to_fulltext(input_dict):
+    return {}
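Like mesh_filter, bio3graph_xml_to_fulltext is a two-phase interactive widget: the base function returns an empty dict, the template collects the user's section choices, and the *_finished callback below does the work. Judging from the .get() calls it makes, the posted form data is assumed to look roughly like this (values are illustrative):

    # Hypothetical postdata shape (an assumption inferred from the .get() calls below):
    postdata = {
        'widget_id': ['42'],
        'num_of_all_articles': ['128'],
        'section_names42': ['abstract', 'results::57', 'figure captions'],
    }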
+def bio3graph_xml_to_fulltext_finished(postdata, input_dict, output_dict):
+    file_name = input_dict['xml_file']
+    output_file_name = file_name + ".new"
+    #if not isinstance(xmls, list):
+    #    xmls = [xmls]
+    num_of_all_articles = postdata.get('num_of_all_articles')[0]
+    article_count = 0
+    from NCBI import NCBI_Extractor
+    a = NCBI_Extractor()
+    widget_id = postdata.get('widget_id')[0]
+    sections = postdata.get('section_names%s' % widget_id)
+    sections = [s.replace("figure captions", "fig").replace("table captions", "table-wrap").replace("article title", "title-group").split("::")[0] for s in sections]
+
+    def get_title(elem):
+        txt = ''
+        if elem.text:
+            txt += elem.text.strip()
+        for child in list(elem):  #only one level
+            if child.text:
+                txt += child.text.strip()
+            if child.tail:
+                txt += child.tail.strip()
+        if elem.tail:
+            txt += elem.tail.strip()
+        return txt.lower()
+
+    def write_to_results(elem_tag, text, results, path, write_from_level, block_from_level):
+        if len(path) >= write_from_level and not len(path) >= block_from_level:
+            if text and text.replace('\n', '').strip() != "":
+                results.append(text.replace('\n', ''))
+                if not elem_tag in ['bold', 'underline', 'italic', 'sub', 'sup']:
+                    results.append(" ")
+        return None
+
+    import xml.etree.ElementTree as ET
+    import re
+
+    def writing_element(elem, sections):
+        if elem.tag == 'sec':
+            return 'sec-type' in elem.attrib and elem.attrib['sec-type'] in sections
+        else:
+            return elem.tag in sections
+
+    results = []
+    skipTags = ['title', 'xref', 'table', 'graphic', 'ext-link', 'media', 'inline-formula', 'disp-formula', 'label']
+    with open(file_name) as f:
+        with open(output_file_name, "w") as output_file:
+            #with open("D:/diagonalization/glio_aml/domain1/1062151.xml") as f:
+            path = []
+            tails = []
+            write_from_level = 100
+            block_from_level = 100
+            for event, elem in ET.iterparse(f, events=("start", "end")):
+                if event == "start":
+                    path.append(elem.tag)
+                    tails.append(elem.tail)
+                    #ancestors.add(elem)
+                    if elem.tag == "article":
+                        write_from_level = 100
+                        block_from_level = 100
+                    else:
+                        if elem.tag in skipTags:
+                            block_from_level = min([block_from_level, len(path)])
+                        if elem.tag == 'sec' and 'sec-type' in elem.attrib and elem.attrib['sec-type'] in sections:
+                            write_from_level = min([len(path) + 1, write_from_level])
+                        elif elem.tag in sections:  #abstract
+                            write_from_level = min([len(path), write_from_level])
+                        elif elem.tag == "title" and get_title(elem) in sections:
+                            write_from_level = min([len(path) - 1, write_from_level])
+                    if elem.tag == "underline":
+                        stop = True
+                    #res=""
+                    write_to_results(elem.tag, elem.text, results, path, write_from_level, block_from_level)
+                elif event == "end":
+                    tail = tails.pop()
+                    path.pop()
+                    write_to_results(elem.tag, tail, results, path, write_from_level, block_from_level)
+                    if len(path) < write_from_level:
+                        write_from_level = 100
+                    if len(path) < block_from_level:
+                        block_from_level = 100
+                    if elem.tag == "article":
+                        body = ''.join(results)  #a.list2text(results)
+                        body = re.sub('(\[)[ ,-:;]*(\])', '', body)
+                        body = body.replace("  ", " ").replace(" ( )", "").replace(" .", ".").replace(" ,", ",") + "\n"
+                        output_file.write(body)
+                        results = []
+                        article_count += 1
+                        print article_count, "/", num_of_all_articles
+                    elem.clear()
+    return {'output_file': output_file_name}
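The paired write_to_results calls (one for elem.text at the start event, one for the saved tail at the end event) exist because of ElementTree's split data model: text inside a tag lands in .text, while text between a closing tag and the next node lands in the preceding element's .tail. A small illustration:

    import xml.etree.ElementTree as ET

    p = ET.fromstring('<p>alpha <bold>beta</bold> gamma</p>')
    bold = list(p)[0]
    print repr(p.text)     # 'alpha '
    print repr(bold.text)  # 'beta'
    print repr(bold.tail)  # ' gamma' -- attached to <bold>, not to <p>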
 def bio3graph_map_entrez_to_ncbi_symbol(input_dict):
     import cPickle
     from os.path import normpath, join, dirname
@@ -291,10 +396,31 @@ def bio3graph_construct_compounds_from_gene_synonyms(input_dict):
+def mesh_filter(input_dict):
+    return {'output_file': 'svoboden kot pticek na veji'}  # placeholder string (Slovenian: "free as a little bird on a branch")
+
+def mesh_filter_finished(postdata, input_dict, output_dict):
+    import cPickle
+    from os.path import normpath, join, dirname
+    widget_id = postdata.get('widget_id')[0]
+    selected_categories = postdata.get('selected[]')
+    terms_per_category = cPickle.load(open(normpath(join(dirname(__file__), 'data/terms_per_category.pickle'))))
+    terms = set()
+    for category in selected_categories:
+        if category not in terms_per_category:  # guard against unknown categories
+            continue
+        terms |= terms_per_category[category]
+    import time
+    unique_filename = time.strftime("%Y-%m-%d-%H-%M-%S")
+    output_file_name = "C:/Users/matic/workspace/iClowdFlow/mothra/public/files/1/terms_" + str(unique_filename) + ".txt"
+    with open(output_file_name, 'w') as of:
+        for term in terms:
+            of.write("%s\n" % term)
+    return {'output_file': output_file_name}
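terms_per_category is expected to map each MeSH top-level category to a set of terms, so the |= in the loop unions terms across all selected categories. A toy sketch of the shape the pickle is assumed to have (category and term names are illustrative):

    terms_per_category = {
        'Anatomy': set(['body regions', 'musculoskeletal system']),
        'Organisms': set(['eukaryota', 'bacteria']),
    }
    terms = set()
    for category in ['Anatomy', 'Organisms']:
        terms |= terms_per_category[category]
    print sorted(terms)  # four terms, duplicates removed by the set union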
@@ -6,3 +6,16 @@ def crossbee_display_summation(request, input_dict, output_dict, widget):
     else:
         check = 'The calculation appears incorrect!'
     return render(request, 'visualizations/crossbee_display_integers.html', {'widget': widget, 'input_dict': input_dict, 'output_dict': output_dict, 'check': check})
+
+def open_data_in_crossbee(request, input_dict, output_dict, widget):
+    #from mothra.settings import MEDIA_ROOT
+    #from workflows.helpers import ensure_dir
+    #destination = MEDIA_ROOT+'/'+str(request.user.id)+'/'+str(widget.id)+'.txt'
+    #ensure_dir(destination)
+    #f = open(destination,'w')
+    #f.write(str(input_dict['string']))
+    #f.close()
+    #filename = str(request.user.id)+'/'+str(widget.id)+'.txt'
+    #output_dict['filename'] = filename
+    return render(request, 'visualizations/open_data_in_crossbee.html', {'widget': widget})  #,'input_dict':input_dict,'output_dict':output_dict})
@@ -89,18 +89,21 @@ def ilp_sdmaleph(input_dict):
 def ilp_wordification(input_dict):
     target_table = input_dict.get('target_table', None)
     other_tables = input_dict.get('other_tables', None)
+    measure = input_dict.get('measure', None)
     context = input_dict.get('context', None)
     word_att_length = int(input_dict.get('f_ngram_size', 1))
     for _ in range(1):
         wordification = Wordification(target_table, other_tables, context, word_att_length)
         wordification.run(1)
-        wordification.calculate_tf_idfs(False)
+        wordification.calculate_tf_idfs(measure)
         #wordification.prune(50)
         #wordification.to_arff()
-    #from wordification import Wordification_features_test
-    #wft=Wordification_features_test(target_table,other_tables,context)
-    #wft.print_results()
+    if 1 == 0:
+        from wordification import Wordification_features_test
+        wft = Wordification_features_test(target_table, other_tables, context)
+        wft.print_results()
     return {'arff': wordification.to_arff(), 'corpus': wordification.wordify()}
@@ -117,4 +120,4 @@ def ilp_treeliker(input_dict):
         'max_degree': input_dict.get('max_degree')
     }
     arff = TreeLiker(dataset, template).run(settings=settings)
-    return {'arff': arff}
\ No newline at end of file
+    return {'arff': arff}

 from wordification import Wordification
-#from wordification_features_text import Wordification_features_test
+from wordification_features_text import Wordification_features_test
@@ -188,7 +188,7 @@ class Wordification(object):
                 #self.resulting_documents.append(self.wordify_example(self.target_table,ex,set([])))
-    def calculate_tf_idfs(self, tf_only=False):
+    def calculate_tf_idfs(self, measure):
         """
         Counts word frequency and calculates tf-idf values for words in every document.
         """
@@ -215,7 +215,7 @@ class Wordification(object):
             for word in document:
                 tf = train_word_count[word]
-                idf = 1 if tf_only else log(no_of_documents / float(self.word_in_how_many_documents[word]))
+                idf = 1 if measure == "tf" else log(no_of_documents / float(self.word_in_how_many_documents[word]))
                 self.tf_idfs[doc_idx][word] = tf * idf
@@ -234,7 +234,7 @@ class Wordification(object):
         arff_string += "@ATTRIBUTE\tclass\t{" + string.join(set([str(a) for a in self.resulting_classes]), ",") + "}\n\n@DATA\n"
         for doc_idx in range(len(self.resulting_documents)):
-            print doc_idx
+            #print doc_idx
             features = []
             for word in words:
                 if word in self.tf_idfs[doc_idx]:
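The new measure argument reduces calculate_tf_idfs to two cases: plain term frequency when measure == "tf", otherwise tf * log(N / df) exactly as in the changed line above. A tiny worked check of that formula with toy counts (names mirror the attributes above, values are illustrative):

    from math import log

    no_of_documents = 4
    tf = 3                           # occurrences of the word in this document
    word_in_how_many_documents = 2   # document frequency of the word

    for measure in ('tf', 'tfidf'):
        idf = 1 if measure == 'tf' else log(no_of_documents / float(word_in_how_many_documents))
        print measure, tf * idf      # tf -> 3; tfidf -> 3*log(2) ~= 2.079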
+__author__ = 'matic'
+from collections import defaultdict
+from wordification import Wordification
+from services.webservice import WebService
+
+def get_cross_validation_accuracy(arff):
+    acs = []
+    for a in range(10):
+        j48 = WebService("http://vihar.ijs.si:8092/Classification?wsdl")
+        j48_response = j48.client.J48(params="")
+        j48_learner = j48_response['J48_learner']
+        arff2weka = WebService("http://vihar.ijs.si:8092/Utilities?wsdl")
+        arff2weka_response = arff2weka.client.arff_to_weka_instances(arff=arff, class_index="")
+        instances = arff2weka_response['instances']
+        cv = WebService("http://vihar.ijs.si:8092/Evaluation?wsdl", timeout=600)
+        cv_response = cv.client.cross_validate(learner=j48_learner, instances=instances, folds=5)
+        accuracy = cv_response['accuracy']
+        acs.append(float(accuracy))
+    return sum(acs) * 1. / len(acs)
+
+def split_string_to_words(string):
+    documents = string.split("\n")
+    words = set([])
+    for document in documents:
+        #print document.split(" ")
+        words |= set(document.split(" "))
+    #print words
+    return words
+
+def prunning_name(perc):
+    if perc:
+        return "Pruning " + str(perc) + "%"
+    else:
+        return "No pruning"
+
+class Wordification_features_test(object):
+    def __init__(self, target_table, other_tables, context):
+        self.max_witem_length = 6
+        self.results = []
+        self.feature_counts = []
+        self.accuracies = []
+        for prunning_percentage in [None, 20, 40]:
+            rez_a = []
+            rez_c = []
+            for word_att_length in range(1, self.max_witem_length):
+                print "percentage:", prunning_percentage, "witem:", word_att_length
+                pruned = Wordification(target_table, other_tables, context, word_att_length)
+                pruned.run()
+                pruned.calculate_tf_idfs("tfidf")
+                if prunning_percentage:
+                    pruned.prune(prunning_percentage)
+                wordification_string = pruned.wordify()
+                rez_c.append(len(split_string_to_words(wordification_string)))
+                a = pruned.to_arff()
+                rez_a.append(get_cross_validation_accuracy(a))
+            self.feature_counts.append([prunning_percentage, rez_c])
+            self.accuracies.append([prunning_percentage, rez_a])
+
+    # import os
+    # os.environ.setdefault("DJANGO_SETTINGS_MODULE", "mothra.settings")
+    # from services.webservice import WebService
+    #
+    # def weka_stuff_todo(self):
+    #     input_dict = {}
+    #     f = input_dict['file']
+    #     fname = os.path.basename(input_dict['file'])
+    #     wsdl = input_dict.get('wsdl', 'http://vihar.ijs.si:8092/Evaluation?wsdl')
+    #     ws = WebService(wsdl, 60000)
+    #     response = ws.client.cross_validate(fileName=fname, inFile=data)
+
+    def print_results(self):
+        import matplotlib.pyplot as plt
+        #len(self.nonpruned_feature_count)
+        #x = np.linspace(1, len(self.results))
+        #print self.nonpruned_feature_count
+        #print self.pruned_feature_count
+        for prunning_percentage, feature_counts in self.feature_counts:
+            plt.plot(range(1, len(feature_counts) + 1), feature_counts, '-', marker='o', linewidth=2, label=prunning_name(prunning_percentage))
+        #plt.plot(range(1,len(self.pruned_feature_count)+1), self.pruned_feature_count, '-',marker='o', linewidth=2)
+        #plt.plot(x, y, 'k')
+        #plt.title('Damped exponential decay')#, fontdict=font)
+        #plt.text(2, 0.65, r'$\cos(2 \pi t) \exp(-t)$')#, fontdict=font)
+        plt.xlabel('Number of witems per word')  #, fontdict=font)
+        plt.ylabel('Number of generated features')  #, fontdict=font)
+        plt.legend(title="Pruning percentage", loc="best")
+        plt.grid()
+        #plt.show()
+        plt.savefig('featuresWitems.jpg')
+        plt.clf()
+        for prunning_percentage, accuracies in self.accuracies:
+            plt.plot(range(1, len(accuracies) + 1), accuracies, '-', marker='o', linewidth=2, label=prunning_name(prunning_percentage))
+        plt.xlabel('Number of witems per word')  #, fontdict=font)
+        plt.ylabel('Classification accuracy')  #, fontdict=font)
+        plt.legend(title="Pruning percentage", loc="best")
+        plt.grid()
+        plt.savefig('accuraciesWitems.jpg')
+        plt.clf()
+        print self.feature_counts
+        print self.accuracies
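For reference, the disabled `if 1 == 0:` block in ilp_wordification above shows the intended entry point for this test harness; run on its own it amounts to the sketch below (the three argument objects come from the ILP workflow context and are assumed here):

    from wordification_features_text import Wordification_features_test

    # target_table, other_tables, context: assumed to come from the ILP widget inputs.
    wft = Wordification_features_test(target_table, other_tables, context)
    wft.print_results()  # writes featuresWitems.jpg and accuraciesWitems.jpg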