Commit 44a5255e authored by Janez K

Merge branch 'dev' of source.ijs.si:mothra into dev

parents 973f6c3d bd922c6a
@@ -875,7 +875,7 @@
}
},
{
-"pk": 147,
+"pk": 153,
"model": "workflows.abstractwidget",
"fields": {
"category": 22,
@@ -901,10 +901,10 @@
}
},
{
-"pk": 352,
+"pk": 360,
"model": "workflows.abstractinput",
"fields": {
-"widget": 147,
+"widget": 153,
"name": "Input string",
"short_name": "str",
"uid": "8436f5de-560f-427d-bb3e-b5e3774b56e7",
@@ -919,10 +919,10 @@
}
},
{
-"pk": 165,
+"pk": 172,
"model": "workflows.abstractoutput",
"fields": {
-"widget": 147,
+"widget": 153,
"name": "Object",
"short_name": "obj",
"variable": "evaluation_result",
@@ -931,6 +931,63 @@
"description": "Result of evaluation"
}
},
{
"pk": 206,
"model": "workflows.abstractwidget",
"fields": {
"category": 22,
"treeview_image": "",
"name": "Ravel list",
"is_streaming": false,
"uid": "ac754eed-37a4-48bc-b837-2f3db3f8f547",
"interaction_view": "",
"image": "",
"package": "base",
"static_image": "",
"post_interact_action": "",
"user": null,
"visualization_view": "",
"streaming_visualization_view": "",
"action": "base_ravel_list",
"wsdl_method": "",
"wsdl": "",
"interactive": false,
"has_progress_bar": false,
"order": 1,
"description": "Ravels and removes empty sublists from the input list. Useful in for loops."
}
},
{
"pk": 434,
"model": "workflows.abstractinput",
"fields": {
"widget": 206,
"name": "List",
"short_name": "lst",
"uid": "c35deb32-ca39-435a-b859-50e4c94a7c74",
"default": "",
"required": true,
"multi": false,
"parameter_type": null,
"variable": "input_list",
"parameter": false,
"order": 1,
"description": "Input list"
}
},
{
"pk": 225,
"model": "workflows.abstractoutput",
"fields": {
"widget": 206,
"name": "List",
"short_name": "lst",
"variable": "clean_list",
"uid": "27187dd2-3ce4-4abe-bfec-f96288c342f3",
"order": 1,
"description": "Output list"
}
},
{
"pk": 126,
"model": "workflows.abstractwidget",
@@ -123,4 +123,21 @@ def base_safe_eval_string(input_dict):
        raise Exception('Cannot evaluate string (remember, for safety reasons only literal structures can be evaluated: strings, numbers, tuples, lists, dicts, booleans, and None)')
    else:
        return {'evaluation_result': result}
#end

def base_ravel_list(input_dict):
    def ravel(data, result):
        for x in data:
            if not isinstance(x, list):
                result.append(x)
            else:
                if x:
                    ravel(x, result)
    #end
    ilist = input_dict['input_list']
    result = []
    ravel(ilist, result)
    return {'clean_list': result}
#end
\ No newline at end of file
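The fixture entries above register the new "Ravel list" widget and bind it to the base_ravel_list action: the abstractinput variable "input_list" is the key the function reads, and the abstractoutput variable "clean_list" is the key it must return. A minimal sketch of that contract, with made-up data:

# Hypothetical call, mirroring how the workflow engine builds input_dict
# from the "input_list" variable and reads the "clean_list" output variable.
example = {'input_list': [1, [2, [3, 4]], [], [[], [5]]]}
print base_ravel_list(example)   # -> {'clean_list': [1, 2, 3, 4, 5]}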
import urllib
import urllib2
import csv
import cStringIO
import os
import cPickle
import time
from os.path import join, normpath, dirname
# http://gpsdb.expasy.org/cgi-bin/gpsdb/show?name=interleukin%208&model=Homo%20sapiens&type=gene&format=txt
# gpsdb.expasy.org/cgi-bin/gpsdb/show?name=indy&format=txt
#http://gpsdb.expasy.org/cgi-bin/gpsdb/show?model=Homo+sapiens&type=gene&name=interleukin+8&format=txt
GPSDB_URL = 'http://gpsdb.expasy.org/cgi-bin/gpsdb/show?'
NAME = 'name'
MODEL = 'model'
TYPE = 'type'
FORMAT = 'format'
MODEL_HUMAN = 'Homo sapiens'
TYPE_GENE = 'gene'
TXT_FORMAT = 'txt'
#entrez2symbol = cPickle.load(open('mappings/e2symb'))
#entrez2synonyms = cPickle.load(open('mappings/e2syns'))
class Synonym_extractor(object):
    qwait = 0.5
    cache_fname = 'data/gene_synonyms_cache'

    def __init__(self):
        self.last_qdate = 0
        try:
            self._cache = cPickle.load(open(normpath(join(dirname(__file__), self.cache_fname))))
        except Exception:
            self._cache = {}
    #end

    def _flush_cache(self):
        #fp = open(self.cache_fname, 'wb')
        fp = open(normpath(join(dirname(__file__), self.cache_fname)), 'wb')
        cPickle.dump(self._cache, fp)
        fp.close()
    #end

    def __del__(self):
        self._flush_cache()
    def wait(self):
        # 2 requests per second (for now)
        td = time.time() - self.last_qdate
        if td < self.qwait:
            print 'sleeping for %.2f seconds' % (self.qwait - td)
            time.sleep(self.qwait - td)
        # record the query time so the throttle above has a reference point
        self.last_qdate = time.time()
    #end
    def make_query_url(self, geneSymbol):
        params = {NAME: geneSymbol, MODEL: MODEL_HUMAN, TYPE: TYPE_GENE, FORMAT: TXT_FORMAT}
        qstring = GPSDB_URL + urllib.urlencode(params)
        return qstring
    #end

    def get_gene_synonyms(self, geneSymbol):
        if geneSymbol in self._cache:
            return self._cache[geneSymbol]
        self.wait()
        qstring = self.make_query_url(geneSymbol)
        try:
            fs = cStringIO.StringIO(urllib2.urlopen(urllib2.Request(qstring)).read())
            #print fs.getvalue()
            table = csv.reader(fs, delimiter='\t')
        except Exception, e:
            print e.message
            return []
        synonyms = [row[0] for row in table]
        synonyms = [x.lower() for x in synonyms]
        synonyms = list(set(synonyms))
        self._cache.update({geneSymbol: synonyms})
        return synonyms
    #end

    def get_geneset_synonyms(self, genes):
        assert(isinstance(genes, list))
        result = {}
        for gene in genes:
            result[gene] = self.get_gene_synonyms(gene)
        return result
    #end
#end class
#res['A']['gsea', 'fisher', 'all', 'page'][1, 2, 3,...]['topGenes', 'terms', 'allGenes', 'scores']
# ['z_score', 'page_p', 'gsea_p', 'enrichment_score', 'unadjusted_p', 'fisher_p', 'aggregate_p']
# def get_synonyms_for_results(resultsFile):
# assert(os.path.isfile(resultsFile))
#
# res = cPickle.load(open(resultsFile))
# geneSets = {}
# for ruleNum in res['A']['all'].keys():
# rule = res['A']['all'][ruleNum]
# geneSets[ruleNum] = rule['topGenes']
#
#
# synonyms = {}
# se = Synonym_extractor()
# for ruleNum in geneSets.keys():
# synonyms[ruleNum] = se.get_geneset_synonyms(geneSets[ruleNum])
#
# fp = open('result_genes2syns.pickle', 'wb')
# cPickle.dump(synonyms, fp, cPickle.HIGHEST_PROTOCOL)
# fp.close()
# print 'written rule %d, genes: %d' % (ruleNum, len(synonyms[ruleNum]))
# return synonyms
# #end
#
#
# def synMapping():
# res_syns = cPickle.load(open('data/result_genes2syns.pickle'))
# syn_map = {}
# for ruleNum in res_syns.keys():
# for egid in res_syns[ruleNum].keys():
# if egid not in syn_map:
# syn_map[egid] = res_syns[ruleNum][egid]
# return syn_map
#
# #a = synMapping()
# #fp = open('data/gene2syn', 'wb')
# #cPickle.dump(a, fp, cPickle.HIGHEST_PROTOCOL)
# #fp.close()
#e2syns = cPickle.load(open('mappings/e2syns'))
#result_syns = cPickle.load(open('data/result_genes2syns.pickle'))
#for ruleNum in result_syns.keys():
#for egid in result_syns[ruleNum].keys():
#gpsdb = result_syns[ruleNum][egid]
#ncbi = list(set(e2syns[egid]))
#joined = list(set(gpsdb).union(set(ncbi)))
#diff = set(ncbi).difference(gpsdb)
##print ncbi
#if diff:
#print diff
#print 'gpsdb: %d, ncbi: %d, total: %d' %(len(gpsdb), len(ncbi), len(joined))
############
# a = Synonym_extractor()
# s = a.get_geneset_synonyms(['lgals3', 'sfpq', 'ddx39b', 'srsf11', 'cir1', 'luc7l3', 'prpf39', 'hnrnph1', 'lgals3', 'sfpq', 'ddx39b'])
##a = get_synonyms_for_results('data/ALL-FINAL.pickle')
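Synonym_extractor batches GPSDB lookups behind the on-disk cache (data/gene_synonyms_cache, written back when the object is garbage-collected via __del__) and throttles live queries through wait(). A minimal usage sketch along the lines of the commented-out example above, with hypothetical gene symbols and assuming the GPSDB service is reachable:

se = Synonym_extractor()
syns = se.get_geneset_synonyms(['il8', 'tp53'])   # hypothetical symbols
for gene, synonyms in syns.items():
    print '%s: %d synonyms' % (gene, len(synonyms))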
import urllib2
import urllib
import os
from xml.etree import cElementTree as tree
import time
import pickle
import re
import xml.dom.minidom as dom
import cPickle
import csv
import codecs
import unidecode
class Document(object):
    def __init__(self):
        self.docid = None
        self.year = None
        self.title = None
        self.abstract = None
        self.body = None
        #self.text = None
        self.xml = None
    #end

    def write_content_text(self, outdir, utf=True):
        assert(os.path.isdir(outdir))
        if utf:
            fp = codecs.open(os.path.join(outdir, self.docid + '.txt'), 'w', encoding='utf-8')
            fp.write(self.title + '\n' + self.abstract + '\n' + self.body)
        else:
            fp = open(os.path.join(outdir, self.docid + '.txt'), 'w')
            fp.write(unidecode.unidecode(self.title) + '\n' + unidecode.unidecode(self.abstract) + '\n' + unidecode.unidecode(self.body))
        fp.close()
    #end
class NCBI_Extractor(object):
    #qwait = 0.33
    qwait = 1.0
    maxdoc = 1000
    searchURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    fetchURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    pmcURL = 'http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3315246/'

    def __init__(self):
        self.last_qdate = 0
    #end
    def dispatchRequest(self, q):
        # obey NCBI limitations (3 requests per second)
        td = time.time() - self.last_qdate
        if td < self.qwait:
            print 'sleeping for %.2f seconds' % (self.qwait - td)
            time.sleep(self.qwait - td)
        self.last_qdate = time.time()
        return urllib2.urlopen(urllib2.Request(q)).read()
    #end
    def getIDs(self, queryURL, maxHits=0):
        ids = []
        cnt = 1
        # first batch of results
        result = self.dispatchRequest(queryURL)
        t = tree.fromstring(result)
        ids.extend([x.text for x in t.find('IdList').findall('Id')])
        hits = int(t.find('Count').text)
        print 'Total hits: ', hits
        print 'batch: %d, got: %d' % (cnt, len(ids))
        # if we have enough already
        if maxHits > 0 and (len(ids) > maxHits or maxHits > hits):
            return ids[:maxHits]
        # if there are more, get them also with retstart option
        while len(ids) < hits:
            nq = queryURL + '&retstart=%d&retmax=%d' % (len(ids), self.maxdoc)
            result = self.dispatchRequest(nq)
            t = tree.fromstring(result)
            ids.extend([x.text for x in t.find('IdList').findall('Id')])
            cnt += 1
            print 'batch: %d, total: %d' % (cnt, len(ids))
            if maxHits and len(ids) >= maxHits:
                break
        #end
        if maxHits:
            return ids[:maxHits]
        else:
            return ids
    #end
    def query(self, queryText, db='pmc', maxHits=0):
        if not queryText:
            raise ValueError('Empty query!')
        query = [('db', db), ('term', queryText)]
        query.append(('retmax', self.maxdoc))
        query = '%s?%s' % (self.searchURL, urllib.urlencode(query))
        ids = self.getIDs(query, maxHits=maxHits)
        return ids
    #end
    def getDocument(self, did, db='pmc'):
        xml = self.getXML(did, db)
        root = dom.parseString(xml)
        doc = self.extractArticleText(root, did)
        doc.docid = did
        doc.xml = xml
        return doc
    #end

    def getXML(self, did, db='pmc'):
        query = [('db', db), ('id', did)]
        url = '%s?%s' % (self.fetchURL, urllib.urlencode(query))
        xml = self.dispatchRequest(url)
        return xml
    #end

    def getFulltext(self, did):
        xml = self.getXML(did)
        root = dom.parseString(xml)
        doc = self.extractArticleText(root, did)
        return doc

    def getDocumentFromXMLfile(self, fname, did=None):
        #xml = codecs.open(fname, encoding='utf-8').read()
        if not did:
            did = os.path.splitext(os.path.split(fname)[1])[0]
        xml = open(fname).read()
        root = dom.parseString(xml)
        doc = self.extractArticleText(root, did)
        doc.docid = did
        doc.xml = xml
        return doc
    #end
    def extractArticleText(self, root, did):
        try:
            titleNode = root.getElementsByTagName('article-title')[0]
        except Exception:
            title = ''
            print 'Warning: no title found, document %s' % str(did)
        else:
            title = self.list2text(self.recursiveCollect(titleNode, []))

        try:
            abstractNode = root.getElementsByTagName('abstract')[0]
        except Exception:
            abstract = ''
            print 'Warning: no abstract found, document %s' % str(did)
        else:
            abstract = self.list2text(self.recursiveCollect(abstractNode, []))
            abstract = re.sub('(\[)[ ,-:;]*(\])', '', abstract)  # remove what remains of citations

        try:
            bodyNode = root.getElementsByTagName('body')[0]
        except Exception:
            body = ''
            print 'Warning: no body found, document %s' % str(did)
        else:
            body = self.list2text(self.recursiveCollect(bodyNode, []))
            body = re.sub('(\[)[ ,-:;]*(\])', '', body)

        ytags = root.getElementsByTagName('pub-date')
        years = []
        for x in ytags:
            y = x.getElementsByTagName('year')
            if y:
                years.append(int(y[0].childNodes[0].data))
        year = min(years) if years else None  # guard against documents without a <year> tag

        new = Document()
        new.year = year
        new.title = title
        new.abstract = abstract
        new.body = body
        #new.text = abstract + ' ' + body
        return new
    #end
    #
    def recursiveCollect(self, node, result, skipTags=['title', 'xref', 'table', 'graphic', 'ext-link',
                                                       'media', 'inline-formula', 'disp-formula']):
        for child in node.childNodes:
            if child.nodeType == dom.Node.ELEMENT_NODE:
                if child.tagName not in skipTags:
                    self.recursiveCollect(child, result)
            elif child.nodeType == dom.Node.TEXT_NODE:
                result.append(child.data)
        #endfor
        return result
    #end

    def list2text(self, lst):
        result = ''
        for x in lst:
            result += x.strip() + ' '
        return result.strip()
    #end
#end
########################################
### ALL LEUKEMIA SEARCH
#a = NCBI_Extractor()
##d = a.getDocument(2792210)
#ids = a.query('("t-lymphocytes"[MeSH Terms] OR "t-lymphocytes"[All Fields] OR "t cell"[All Fields] OR "t-cell"[All Fields]) OR ("leukaemia"[All Fields] OR "leukemia"[MeSH Terms] OR "leukemia"[All Fields])',
# maxHits=1001)
#ids = a.query('leukemia', maxHits=10)
#fp = open('ALL-ids.pickle', 'wb')
#cPickle.dump(ids, fp, cPickle.HIGHEST_PROTOCOL)
#fp.close()
########################################
\ No newline at end of file
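NCBI_Extractor drives the two Entrez E-utilities endpoints declared above: query() runs esearch and pages through the ids via retstart, and getDocument() runs efetch and parses the article XML into a Document. A minimal usage sketch with a hypothetical query and output directory, assuming the directory already exists (write_content_text asserts this) and the E-utilities service is reachable:

ex = NCBI_Extractor()
ids = ex.query('leukemia', db='pmc', maxHits=5)   # hypothetical query
for did in ids:
    doc = ex.getDocument(did)
    print did, doc.year, doc.title
    doc.write_content_text('fulltexts')           # hypothetical, pre-existing directory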
import cPickle
import pickle
import cStringIO
import StringIO
import gzip
import urllib
# NCBI data ftp.
NCBI_URL = 'http://mirrors.vbi.vt.edu/mirrors/ftp.ncbi.nih.gov/gene/DATA/'
NCBI_ORGANISM_GENE_DATA = 'GENE_INFO/Mammalia/'
# for gene_info file
FORMAT_LINE_START = '#Format:'
ENTREZ_ID = 'GeneID'
GENE_SYMBOL = 'Symbol'
SYNONYMS = 'Synonyms'
NEWENTRY = 'NEWENTRY'
ENTREZ_ID_COLNO = 1
GENE_SYMBOL_COLNO = 2
SYNONYMS_COLNO = 4
# for gene2accession file
tax_id = 0
GeneID = 1
status = 2
RNA_nucleotide_accession = 3
RNA_nucleotide_gi = 4
protein_accession = 5
protein_gi = 6
genomic_nucleotide_accession = 7
EMPTY = '-'
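# Orientation only: a sketch of how the column indices above slice one
# (made-up) tab-separated gene_info line; getMapping() below walks the whole
# downloaded file the same way. The Synonyms column is '|'-separated.
_example_line = '9606\t3576\tCXCL8\t-\tGCP-1|IL8|NAF'
_example_cols = _example_line.split('\t')
assert _example_cols[ENTREZ_ID_COLNO] == '3576'
assert _example_cols[GENE_SYMBOL_COLNO] == 'CXCL8'
assert _example_cols[SYNONYMS_COLNO].split('|') == ['GCP-1', 'IL8', 'NAF']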
class Mappings(object):
    def __init__(self, entrez2symbol={}, symbol2entrez={}, entrez2synonyms={}, synonyms2entrez={}, errors=[]):
        self.entrez2symbol = entrez2symbol
        self.symbol2entrez = symbol2entrez
        self.entrez2synonyms = entrez2synonyms
        self.synonyms2entrez = synonyms2entrez
    #end
#end class
def getMapping():
    entrez2symbol = {}
    symbol2entrez = {}
    entrez2synonyms = {}
    synonyms2entrez = {}
    for fname in ['Homo_sapiens.gene_info.gz']: #, 'Mus_musculus.gene_info.gz', 'Rattus_norvegicus.gene_info.gz']:
    #for fname in ['gene_info_small']:
        #fp = open(fname, 'r')
        # Download gzip file.
        web_fp = urllib.urlopen('%s%s%s' % (NCBI_URL, NCBI_ORGANISM_GENE_DATA, fname))
        fp = gzip.GzipFile(fileobj=StringIO.StringIO(web_fp.read()))
        print 'Reading file "%s"' % fname
        ln = 0
        while True:
            ln += 1
            if ln%100000 == 0:
                print 'at line %d' % ln
            line = fp.readline().strip()
            if not line: #EOF
                break
            elts = line.split('\t')
            if elts[0].startswith('#'):
                continue
            ## if there is header, check it
            #if elts[0] == FORMAT_LINE_START:
            #if len(elts) < 3 or elts[ENTREZ_ID_COLNO+1] != ENTREZ_ID or elts[GENE_SYMBOL_COLNO+1] != GENE_SYMBOL or \