
Commit 3cbea1e5 authored by vpodpecan

new functions and data for bio3graph

parent b7de1379
import urllib
import urllib2
import csv
import cStringIO
import os
import cPickle
import time
from os.path import join, normpath, dirname
# http://gpsdb.expasy.org/cgi-bin/gpsdb/show?name=interleukin%208&model=Homo%20sapiens&type=gene&format=txt
# gpsdb.expasy.org/cgi-bin/gpsdb/show?name=indy&format=txt
#http://gpsdb.expasy.org/cgi-bin/gpsdb/show?model=Homo+sapiens&type=gene&name=interleukin+8&format=txt
GPSDB_URL = 'http://gpsdb.expasy.org/cgi-bin/gpsdb/show?'
NAME = 'name'
MODEL = 'model'
TYPE = 'type'
FORMAT = 'format'
MODEL_HUMAN = 'Homo sapiens'
TYPE_GENE = 'gene'
TXT_FORMAT = 'txt'
#entrez2symbol = cPickle.load(open('mappings/e2symb'))
#entrez2synonyms = cPickle.load(open('mappings/e2syns'))
class Synonym_extractor(object):
    # queries GPSDB for gene synonyms and keeps a local pickled cache of the results
    qwait = 0.5
    cache_fname = 'data/gene_synonyms_cache'
def __init__(self):
self.last_qdate = 0
try:
self._cache = cPickle.load(open(normpath(join(dirname(__file__), self.cache_fname))))
except Exception:
self._cache = {}
#end
def _flush_cache(self):
#fp = open(self.cache_fname, 'wb')
fp = open(normpath(join(dirname(__file__), self.cache_fname)), 'wb')
cPickle.dump(self._cache, fp)
fp.close()
#end
def __del__(self):
self._flush_cache()
    def wait(self):
        # 2 requests per second (for now); remember the time of this query so
        # that consecutive requests are actually throttled
        td = time.time() - self.last_qdate
        if td < self.qwait:
            print 'sleeping for %.2f seconds' % (self.qwait - td)
            time.sleep(self.qwait - td)
        self.last_qdate = time.time()
    #end
def make_query_url(self, geneSymbol):
params = {NAME: geneSymbol, MODEL: MODEL_HUMAN, TYPE: TYPE_GENE, FORMAT: TXT_FORMAT}
qstring = GPSDB_URL + urllib.urlencode(params)
return qstring
#end
def get_gene_synonyms(self, geneSymbol):
if geneSymbol in self._cache:
return self._cache[geneSymbol]
self.wait()
qstring = self.make_query_url(geneSymbol)
try:
fs = cStringIO.StringIO(urllib2.urlopen(urllib2.Request(qstring)).read())
#print fs.getvalue()
table = csv.reader(fs, delimiter='\t')
        except Exception, e:
            print e
            return []
synonyms = [row[0] for row in table]
synonyms = [x.lower() for x in synonyms]
synonyms = list(set(synonyms))
self._cache.update({geneSymbol: synonyms})
return synonyms
#end
def get_geneset_synonyms(self, genes):
assert(isinstance(genes, list))
result = {}
for gene in genes:
result[gene] = self.get_gene_synonyms(gene)
return result
#end
#end class
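# A minimal usage sketch for the class above (kept commented out, like the
# examples at the bottom of this file). It assumes network access to GPSDB and
# that the 'data/' cache directory exists next to this module; the gene
# symbols are only illustrative.
# se = Synonym_extractor()
# print se.get_gene_synonyms('lgals3')
# print se.get_geneset_synonyms(['lgals3', 'sfpq'])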
#res['A']['gsea', 'fisher', 'all', 'page'][1, 2, 3,...]['topGenes', 'terms', 'allGenes', 'scores']
# ['z_score', 'page_p', 'gsea_p', 'enrichment_score', 'unadjusted_p', 'fisher_p', 'aggregate_p']
# def get_synonyms_for_results(resultsFile):
# assert(os.path.isfile(resultsFile))
#
# res = cPickle.load(open(resultsFile))
# geneSets = {}
# for ruleNum in res['A']['all'].keys():
# rule = res['A']['all'][ruleNum]
# geneSets[ruleNum] = rule['topGenes']
#
#
# synonyms = {}
# se = Synonym_extractor()
# for ruleNum in geneSets.keys():
# synonyms[ruleNum] = se.get_geneset_synonyms(geneSets[ruleNum])
#
# fp = open('result_genes2syns.pickle', 'wb')
# cPickle.dump(synonyms, fp, cPickle.HIGHEST_PROTOCOL)
# fp.close()
# print 'written rule %d, genes: %d' % (ruleNum, len(synonyms[ruleNum]))
# return synonyms
# #end
#
#
# def synMapping():
# res_syns = cPickle.load(open('data/result_genes2syns.pickle'))
# syn_map = {}
# for ruleNum in res_syns.keys():
# for egid in res_syns[ruleNum].keys():
# if egid not in syn_map:
# syn_map[egid] = res_syns[ruleNum][egid]
# return syn_map
#
# #a = synMapping()
# #fp = open('data/gene2syn', 'wb')
# #cPickle.dump(a, fp, cPickle.HIGHEST_PROTOCOL)
# #fp.close()
#e2syns = cPickle.load(open('mappings/e2syns'))
#result_syns = cPickle.load(open('data/result_genes2syns.pickle'))
#for ruleNum in result_syns.keys():
#for egid in result_syns[ruleNum].keys():
#gpsdb = result_syns[ruleNum][egid]
#ncbi = list(set(e2syns[egid]))
#joined = list(set(gpsdb).union(set(ncbi)))
#diff = set(ncbi).difference(gpsdb)
##print ncbi
#if diff:
#print diff
#print 'gpsdb: %d, ncbi: %d, total: %d' %(len(gpsdb), len(ncbi), len(joined))
############
# a = Synonym_extractor()
# s = a.get_geneset_synonyms(['lgals3', 'sfpq', 'ddx39b', 'srsf11', 'cir1', 'luc7l3', 'prpf39', 'hnrnph1', 'lgals3', 'sfpq', 'ddx39b'])
##a = get_synonyms_for_results('data/ALL-FINAL.pickle')
import urllib2
import urllib
import os
from xml.etree import cElementTree as tree
import time
import pickle
import re
import xml.dom.minidom as dom
import cPickle
import csv
import codecs
import unidecode
class Document(object):
def __init__(self):
self.docid = None
self.year = None
self.title = None
self.abstract = None
self.body = None
#self.text = None
self.xml = None
#end
def write_content_text(self, outdir, utf=True):
assert(os.path.isdir(outdir))
if utf:
fp = codecs.open(os.path.join(outdir, self.docid + '.txt'), 'w', encoding='utf-8')
fp.write(self.title + '\n' + self.abstract + '\n' + self.body)
else:
fp = open(os.path.join(outdir, self.docid + '.txt'), 'w')
fp.write(unidecode.unidecode(self.title) + '\n' + unidecode.unidecode(self.abstract) + '\n' + unidecode.unidecode(self.body))
fp.close()
#end
class NCBI_Extractor(object):
    # queries the NCBI E-utilities (PubMed Central by default) and extracts article text
    #qwait = 0.33
    qwait = 1.0
    maxdoc = 1000
searchURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
fetchURL = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
pmcURL = 'http://www.ncbi.nlm.nih.gov/pmc/articles/PMC3315246/'
def __init__(self):
self.last_qdate = 0
#end
def dispatchRequest(self, q):
        # obey NCBI limitations (at most 3 requests per second); one request per second keeps us well under the limit
td = time.time() - self.last_qdate
if td < self.qwait:
print 'sleeping for %.2f seconds' % (self.qwait - td)
time.sleep(self.qwait - td)
self.last_qdate = time.time()
return urllib2.urlopen(urllib2.Request(q)).read()
#end
def getIDs(self, queryURL, maxHits=0):
ids = []
cnt = 1
# first batch of results
result = self.dispatchRequest(queryURL)
t = tree.fromstring(result)
ids.extend([x.text for x in t.find('IdList').findall('Id')])
hits = int(t.find('Count').text)
print 'Total hits: ', hits
print 'batch: %d, got: %d' % (cnt, len(ids))
# if we have enough already
if maxHits > 0 and (len(ids) > maxHits or maxHits > hits):
return ids[:maxHits]
# if there are more, get them also with retstart option
while len(ids) < hits:
nq = queryURL + '&retstart=%d&retmax=%d' % (len(ids), self.maxdoc)
result = self.dispatchRequest(nq)
t = tree.fromstring(result)
ids.extend([x.text for x in t.find('IdList').findall('Id')])
cnt += 1
print 'batch: %d, total: %d' % (cnt, len(ids))
if maxHits and len(ids) >= maxHits:
break
#end
if maxHits:
return ids[:maxHits]
else:
return ids
#end
def query(self, queryText, db='pmc', maxHits=0):
if not queryText:
raise ValueError('Empty query!')
query = [('db', db), ('term', queryText)]
query.append(('retmax', self.maxdoc))
query = '%s?%s' % (self.searchURL, urllib.urlencode(query))
ids = self.getIDs(query, maxHits=maxHits)
return ids
#end
def getDocument(self, did, db='pmc'):
xml = self.getXML(did, db)
root = dom.parseString(xml)
doc = self.extractArticleText(root, did)
doc.docid = did
doc.xml = xml
return doc
#end
def getXML(self, did, db='pmc'):
query = [('db', db), ('id', did)]
url = '%s?%s' % (self.fetchURL, urllib.urlencode(query))
xml = self.dispatchRequest(url)
return xml
#end
def getFulltext(self, did):
xml = self.getXML(did)
root = dom.parseString(xml)
doc = self.extractArticleText(root, did)
return doc
def getDocumentFromXMLfile(self, fname, did=None):
#xml = codecs.open(fname, encoding='utf-8').read()
if not did:
did = os.path.splitext(os.path.split(fname)[1])[0]
xml = open(fname).read()
root = dom.parseString(xml)
doc = self.extractArticleText(root, did)
doc.docid = did
doc.xml = xml
return doc
#end
def extractArticleText(self, root, did):
try:
titleNode = root.getElementsByTagName('article-title')[0]
except Exception:
title = ''
print 'Warning: no title found, document %s' % str(did)
else:
title = self.list2text(self.recursiveCollect(titleNode, []))
try:
abstractNode = root.getElementsByTagName('abstract')[0]
except Exception:
abstract = ''
print 'Warning: no abstract found, document %s' % str(did)
else:
abstract = self.list2text(self.recursiveCollect(abstractNode, []))
abstract = re.sub('(\[)[ ,-:;]*(\])', '', abstract) # remove what remains of citations
try:
bodyNode = root.getElementsByTagName('body')[0]
except Exception:
body = ''
print 'Warning: no body found, document %s' % str(did)
else:
body = self.list2text(self.recursiveCollect(bodyNode, []))
body = re.sub('(\[)[ ,-:;]*(\])', '', body)
        ytags = root.getElementsByTagName('pub-date')
        years = []
        for x in ytags:
            y = x.getElementsByTagName('year')
            if y:
                years.append(int(y[0].childNodes[0].data))
        # use the earliest publication year; guard against documents without a <year> element
        if years:
            year = min(years)
        else:
            year = None
            print 'Warning: no publication year found, document %s' % str(did)
new = Document()
new.year = year
new.title = title
new.abstract = abstract
new.body = body
#new.text = abstract + ' ' + body
return new
#end
#
def recursiveCollect(self, node, result, skipTags=['title', 'xref', 'table', 'graphic', 'ext-link',
'media', 'inline-formula', 'disp-formula']):
for child in node.childNodes:
if child.nodeType == dom.Node.ELEMENT_NODE:
if child.tagName not in skipTags:
self.recursiveCollect(child, result)
elif child.nodeType == dom.Node.TEXT_NODE:
result.append(child.data)
#endfor
return result
#end
def list2text(self, lst):
result = ''
for x in lst:
result += x.strip() + ' '
return result.strip()
#end
#end
########################################
### ALL LEUKEMIA SEARCH
#a = NCBI_Extractor()
##d = a.getDocument(2792210)
#ids = a.query('("t-lymphocytes"[MeSH Terms] OR "t-lymphocytes"[All Fields] OR "t cell"[All Fields] OR "t-cell"[All Fields]) OR ("leukaemia"[All Fields] OR "leukemia"[MeSH Terms] OR "leukemia"[All Fields])',
# maxHits=1001)
#ids = a.query('leukemia', maxHits=10)
#fp = open('ALL-ids.pickle', 'wb')
#cPickle.dump(ids, fp, cPickle.HIGHEST_PROTOCOL)
#fp.close()
########################################
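########################################
### Example (a hedged sketch, kept commented out like the queries above):
### fetch a few hits and dump their text. It assumes network access to the
### NCBI E-utilities and an existing output directory 'data/txt'.
# e = NCBI_Extractor()
# for did in e.query('leukemia', maxHits=3):
#     doc = e.getDocument(did)
#     doc.write_content_text('data/txt')
########################################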
import cPickle
import pickle
import cStringIO
import StringIO
import gzip
import urllib
# NCBI data ftp.
NCBI_URL = 'http://mirrors.vbi.vt.edu/mirrors/ftp.ncbi.nih.gov/gene/DATA/'
NCBI_ORGANISM_GENE_DATA = 'GENE_INFO/Mammalia/'
# for gene_info file
FORMAT_LINE_START = '#Format:'
ENTREZ_ID = 'GeneID'
GENE_SYMBOL = 'Symbol'
SYNONYMS = 'Synonyms'
NEWENTRY = 'NEWENTRY'
ENTREZ_ID_COLNO = 1
GENE_SYMBOL_COLNO = 2
SYNONYMS_COLNO = 4
# for gene2accession file
tax_id = 0
GeneID = 1
status = 2
RNA_nucleotide_accession = 3
RNA_nucleotide_gi = 4
protein_accession = 5
protein_gi = 6
genomic_nucleotide_accession = 7
EMPTY = '-'
class Mappings(object):
    def __init__(self, entrez2symbol={}, symbol2entrez={}, entrez2synonyms={}, synonyms2entrez={}, errors=[]):
        self.entrez2symbol = entrez2symbol
        self.symbol2entrez = symbol2entrez
        self.entrez2synonyms = entrez2synonyms
        self.synonyms2entrez = synonyms2entrez
        self.errors = errors
    #end
#end class
def getMapping():
    # build symbol/synonym <-> Entrez GeneID mappings from the NCBI gene_info file(s)
    entrez2symbol = {}
    symbol2entrez = {}
    entrez2synonyms = {}
    synonyms2entrez = {}
for fname in ['Homo_sapiens.gene_info.gz']: #, 'Mus_musculus.gene_info.gz', 'Rattus_norvegicus.gene_info.gz']:
#for fname in ['gene_info_small']:
#fp = open(fname, 'r')
# Download gzip file.
web_fp = urllib.urlopen('%s%s%s' % (NCBI_URL, NCBI_ORGANISM_GENE_DATA, fname))
fp = gzip.GzipFile(fileobj=StringIO.StringIO(web_fp.read()))
print 'Reading file "%s"' % fname
ln = 0
while True:
ln += 1
if ln%100000 == 0:
print 'at line %d' % ln
line = fp.readline().strip()
if not line: #EOF
break
elts = line.split('\t')
if elts[0].startswith('#'):
continue
## if there is header, check it
#if elts[0] == FORMAT_LINE_START:
#if len(elts) < 3 or elts[ENTREZ_ID_COLNO+1] != ENTREZ_ID or elts[GENE_SYMBOL_COLNO+1] != GENE_SYMBOL or \
#elts[SYNONYMS_COLNO+1] != SYNONYMS:
#raise TypeError('Input file is not an Entrez gene info file.')
#continue
entid = int(elts[ENTREZ_ID_COLNO])
if entid in entrez2symbol or entid in entrez2synonyms:
print 'duplicate GeneID (Entrez ID): %d' % entid
continue
# get gene symbol
symbol = elts[GENE_SYMBOL_COLNO].lower()
if symbol == NEWENTRY:
continue
# we ignore symbols which are not unique
if symbol in symbol2entrez:
print 'Not unique: ', symbol, entid
eid = symbol2entrez[symbol]
del entrez2symbol[eid]
del symbol2entrez[symbol]
continue
entrez2symbol[entid] = symbol
symbol2entrez[symbol] = entid
# get gene synonyms
syns = [s.strip().lower() for s in elts[SYNONYMS_COLNO].split('|')]
syns = [s for s in syns if s!=EMPTY]
# we ignore synonyns which are not unique
entrez2synonyms[entid] = []
for s in syns:
if s in synonyms2entrez:
e = synonyms2entrez[s]
entrez2synonyms[e].remove(s)
del synonyms2entrez[s]
else:
synonyms2entrez[s] = entid
entrez2synonyms[entid].append(s)
#end
#end while
fp.close()
print 'Done.'
#end for
print len(synonyms2entrez), len(entrez2synonyms)
return Mappings(entrez2symbol=entrez2symbol, symbol2entrez=symbol2entrez, entrez2synonyms=entrez2synonyms,\
synonyms2entrez=synonyms2entrez)
#end
def mappingModuleAll(a):
#a = cPickle.load(open('mappingsHMR.pickle'))
assert(isinstance(a, Mappings))
ofp = open('mappings/mappingsFull.py', 'w')
ofp.write('from pickle import load\n')
ofp.write('from cStringIO import StringIO\n\n')
for attr in ['symbol2entrez', 'synonyms2entrez', 'entrez2symbol', 'entrez2synonyms']:
outS = cStringIO.StringIO()
d = getattr(a, attr)
pickle.dump(d, outS)
outS.flush()
#ofp.write("%s = load(StringIO('''%s'''))\n" % (attr, b64encode(outS.getvalue())))
ofp.write("%s = load(StringIO('''%s'''))\n" % (attr, outS.getvalue()))
ofp.close()
#end
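# Possible invocation of the helper above (commented out; it assumes a
# 'mappings/' directory exists in the working directory for the generated module):
# mappingModuleAll(getMapping())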
mapp = getMapping()
fp = open('entrez2symbol.pickle', 'wb')
cPickle.dump(mapp.entrez2symbol, fp, cPickle.HIGHEST_PROTOCOL)
fp.close()
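# A small illustrative lookup sketch (commented out; 'mapp' is the Mappings
# object built above, and the gene symbol is only an example):
# eid = mapp.symbol2entrez.get('tp53')              # symbol -> Entrez GeneID
# if eid is not None:
#     print eid, mapp.entrez2synonyms.get(eid, [])  # GeneID -> synonym list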
import urllib2
import cPickle
from os.path import join, abspath, normpath, dirname
# Build a lookup of PubMed Central accessions listed in the PMC open-access
# file list and pickle it for later membership tests.
FILE_LIST_URL = 'ftp://ftp.ncbi.nlm.nih.gov/pub/pmc/file_list.txt'
dname = dirname(__file__)
mapping = {}
fp = urllib2.urlopen(FILE_LIST_URL)
lines = fp.readlines()
for line in lines:
elts = line.split('\t')
if len(elts) != 3:
continue
pmid = elts[2].strip().replace('PMC', '')
mapping[pmid] = None
fp = open(normpath(join(dname, 'OA_dict.pickle')), 'wb')
cPickle.dump(mapping, fp, cPickle.HIGHEST_PROTOCOL)
fp.close()
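# Illustrative membership check (commented out): load the pickled dictionary
# and test whether a PMC accession (digits only, 'PMC' prefix stripped) is in
# the open-access subset; the accession below is just an example value.
# oa = cPickle.load(open(normpath(join(dname, 'OA_dict.pickle')), 'rb'))
# print '3315246' in oa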