-1

タンパク質とその相互作用物質のリストがあり、さまざまなタンパク質間で共有される相互作用物質の割合を知りたいと思っています。

タンパク質と相互作用物質の私のリストは次のようになります。

head(lista)
$`A1CF `
[1] " A1CF"    " APOBEC1" " CUGBP2"  " KHSRP"   " SYNCRIP" " TNPO2"  

$`A2LD1 `
[1] " A2LD1"   " PRPSAP2" " RPL15"   " TANC1"  

$`A2M `
[1] " A2M"      " ADAM19"   " ADAMTS1"  " AMBP"     " ANXA6"    " APOE"     " APP"      "    B2M"      " C11orf58" " CELA1"    " CPB2"     " CTSB"     " CTSE"    
[14] " F2"       " HSPA5"    " IL10"     " IL1B"     " KLK13"    " KLK2"     " KLK3"     " KLK5"     " KLKB1"    " LCAT"     " LEP"      " LRP1"     " MMP2"    
[27] " MYOC"     " NGF"      " PAEP"     " PDGFA"    " PDGFB"    " PLG"      " SERPINA1" "  SHBG"     " SPACA3"   " TGFBI"   

$`AAAS `
[1] " AAAS"    " ARHGAP1" " BANF1"   " CCNG2"   " EP300"   " HMGA1"   " KPNB1"   " NUP107"  " NUP133"  " NUP153"  " NUP155"  " NUP160"  " NUP188"  " NUP205" 
[15] " NUP210"  " NUP214"  " NUP35"   " NUP37"   " NUP43"   " NUP50"   " NUP54"   " NUP62"   " NUP85"   " NUP88"   " NUP93"   " NUP98"   " NUPL1"   " NUPL2"  
[29] " PLK4"    " POM121C" " PSIP1"   " RAE1"    " RAN"     " RANBP2"  " SEH1L"   " TARDBP"  " TPR"     " TTK"     " XPO1"   

$`AAGAB `
[1] " AAGAB"  " AFTPH"  " EIF3C"  " UNC119"

$`AAK1 `
[1] " AAK1"     " ACOX3"    " ADAM28"   " ALPK3"    " AURKB"    " AZI2"     " BMP2K"    " CABC1"    " CAMK2G"   " DCK"      " DCTPP1"   " EIF2AK1"  " FAM83A"  
[14] " FER"      " FRYL"     " GAPVD1"   " GFPT1"    " HIPK1"    " JAK1"     " KIAA0195" " KIAA0528" " LIMK2"    " LSM14A"   " MAP4K2"   " MAP4K5"   " MAPK6"   
[27] " NEK11"    " NQO2"     " NUMB"     " PDE4A"    " PIP4K2C"  " PKN3"     " PRKAA1"   " PTPN18"   " SIK2"     " SIK3"     " SPEG"     " TAOK1"    " TAOK3"   
[40] " TBK1"     " TBKBP1"   " TESK2"    " TMX1"     " TNK1"     " ZAK" 

タンパク質間の共有インタラクターの割合を取得するために、次のことを行いました。

lista の長さに等しい次元の行列を作成しました。

# Pre-allocate the square result matrix: one row and one column per protein.
# The size is taken from `lista` instead of a hard-coded 9794 so the matrix
# always matches the data. NA_real_ marks cells that have not been filled yet.
M <- matrix(
  NA_real_,
  nrow = length(lista),
  ncol = length(lista),
  dimnames = list(names(lista), names(lista))
)

# Fraction of the elements of `x` that also occur in `y`.
# The denominator is length(x), so dFun3(x, y) and dFun3(y, x) generally
# differ; an empty `x` gives NaN (0 / 0).
dFun3 <- function(x, y) {
  shared <- x %in% y
  sum(shared) / length(x)
}

# Fill M with the share of interactors each protein has in common with every
# other protein. The matrix is not symmetric: M[i, j] is the fraction of
# protein i's interactors that also appear in protein j's list, which generally
# differs from M[j, i].
n_proteins <- length(lista)
# seq_len() gives an empty sequence for an empty list, whereas
# 1:length(lista) would iterate over c(1, 0) and fail on lista[[1]].
for (i in seq_len(n_proteins)) {
  # Compute a whole row per iteration instead of assigning cell by cell.
  M[i, seq_len(n_proteins)] <- vapply(
    lista,
    function(other) dFun3(lista[[i]], other),
    numeric(1),
    USE.NAMES = FALSE
  )
}

これで、AxB と BxA の比較を示す行列ができました。私が今やりたいことは、タンパク質 i の値とタンパク質 j の値を比較することです。アイデアは、AxB と BxA を比較して、AxB > 0.7 かつ BxA < 0.7 の場合にタンパク質 A を削除することです。私のアプローチは、次のような for ループを作成することです。

# Mark asymmetric overlaps between every ordered pair of proteins.
# M[i, j] is read from row i / column j, and t(M)[i, j] equals M[j, i], i.e.
# the value for the same pair in the opposite direction, so one element-wise
# comparison replaces the double loop. The previous loop read an undefined
# `x`, put whole rows/columns into an `if` condition (an error since R 4.2),
# and wrote character marks into a numeric matrix.
stopifnot(is.matrix(M), nrow(M) == ncol(M))
reverse_share <- t(M)
x <- matrix(0L, nrow = nrow(M), ncol = ncol(M), dimnames = dimnames(M))
# which() drops NA/NaN comparisons, so unfilled or undefined cells stay 0.
# -1: M[i, j] > 0.7 while M[j, i] < 0.7
x[which(M > 0.7 & reverse_share < 0.7)] <- -1L
# +1: M[j, i] > 0.7 while M[i, j] < 0.7
x[which(reverse_share > 0.7 & M < 0.7)] <- 1L

この方法で、-1 と +1 の印が付いた比較結果をもとにタンパク質を削除するつもりです。

それにもかかわらず、このアプローチには長い時間がかかります...どんな提案でも大歓迎です。

ありがとう

4

1 に答える 1

2

combn と intersect の組み合わせが良い候補のようです。たとえば、これを試してください:

# For every unordered pair of list indices, the number of distinct elements
# common to both entries divided by the length of the first entry.
combn(seq_along(lista), 2, function(pair) {
  first <- lista[[pair[1]]]
  second <- lista[[pair[2]]]
  length(intersect(first, second)) / length(first)
})

[1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0  ## gives all zeros here since 
                                   ## no intersection in your example

実際、combnは可能なすべてのインデックスの組み合わせを生成し、それらをインデックスのペアとして関数に渡して交差をテストします。

# Every pair of list indices, one pair per column.
combn(seq_along(lista), m = 2)
     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14] [,15]
[1,]    1    1    1    1    1    2    2    2    2     3     3     3     4     4     5
[2,]    2    3    4    5    6    3    4    5    6     4     5     6     5     6     6

OPは再現可能な例を提供しないため、編集してlistaのdputを追加します。

dput(lista)
structure(list(A1CF = c(" A1CF", " AURKB", " CUGBP2", " KHSRP", 
" SYNCRIP", " TNPO2"), A2LD1 = c(" A2LD1", " PRPSAP2", " RPL15", 
" TANC1"), A2M = c(" A2M", " ADAM19", " ADAMTS1", " AMBP", " ANXA6", 
" APOE", " APP", ",B2M", " C11orf58", " CELA1", " CPB2", " CTSB", 
" CTSE", " F2", " HSPA5", " IL10", " IL1B", " KLK13", " KLK2", 
" KLK3", " KLK5", " KLKB1", " LCAT", " LEP", " LRP1", " MMP2", 
" MYOC", " NGF", " PAEP", " PDGFA", " PDGFB", " PLG", " SERPINA1", 
"  SHBG", " SPACA3", " TGFBI"), AAAS = c(" AAAS", " ARHGAP1", 
" BANF1", " CCNG2", " EP300", " HMGA1", " KPNB1", " NUP107", 
" NUP133", " NUP153", " NUP155", " NUP160", " NUP188", " NUP205", 
" NUP210", " NUP214", " NUP35", " NUP37", " NUP43", " NUP50", 
" NUP54", " NUP62", " NUP85", " NUP88", " NUP93", " NUP98", " NUPL1", 
" NUPL2", " PLK4", " POM121C", " PSIP1", " RAE1", " RAN", " RANBP2", 
" SEH1L", " TARDBP", " TPR", " TTK", " XPO1"), AAGAB = c(" AAGAB", 
" AFTPH", " EIF3C", " UNC119"), AAK1 = c(" AAK1", " ACOX3", " ADAM28", 
" ALPK3", " AURKB", " AZI2", " BMP2K", " CABC1", " CAMK2G", " DCK", 
" DCTPP1", " EIF2AK1", " FAM83A", " FER", " FRYL", " GAPVD1", 
" GFPT1", " HIPK1", " JAK1", " KIAA0195", " KIAA0528", " LIMK2", 
" LSM14A", " MAP4K2", " MAP4K5", " MAPK6", " NEK11", " NQO2", 
" NUMB", " PDE4A", " PIP4K2C", " PKN3", " PRKAA1", " PTPN18", 
" SIK2", " SIK3", " SPEG", " TAOK1", " TAOK3", " TBK1", " TBKBP1", 
" TESK2", " TMX1", " TNK1", " ZAK")), .Names = c("A1CF", "A2LD1", 
"A2M", "AAAS", "AAGAB", "AAK1"))

編集

行列の行 2 と行 1 の比較を探すために、関数を次のように変更できます。

## For each unordered pair of list indices, compare the shared-element ratio
## in both directions. Returns the index whose ratio is > 0.7 while the other
## is < 0.7, or NA when neither direction qualifies.
ll <- combn(seq_along(lista), 2, FUN = function(x) {
  first <- lista[[x[1]]]
  second <- lista[[x[2]]]
  ## shared elements as a fraction of each entry's own length
  ratio <- length(intersect(first, second)) /
    c(length(first), length(second))
  res <- NA_integer_                    ## value to return by default
  ## Scalar && belongs in `if`; isTRUE() treats the NaN ratio of an empty
  ## entry as "no match" instead of aborting with a missing-value error.
  if (isTRUE(ratio[1] > 0.7 && ratio[2] < 0.7)) {
    res <- x[[1]]                       ## return the index of the first protein
  }
  if (isTRUE(ratio[2] > 0.7 && ratio[1] < 0.7)) {
    res <- x[[2]]                       ## return the index of the second protein
  }
  res
})
## The same index can be returned for several pairs, so keep each one once.
flagged <- unique(ll[!is.na(ll)])
## to get the list of proteins to remove
names(lista)[flagged]
## to remove the proteins from the original list
lista[!names(lista) %in% names(lista)[flagged]]

たぶん、llリストから重複を削除する必要があります。

参考までに、組み合わせの数は choose(9794, 2) = 47956321 です。

2013-06-06T04:38:04.607 に回答