1

現在、forloop 内でテキスト ファイルと xlsx ファイル (openxlsx パッケージ) を読み取ることができるコードがあります。私がやろうとしているのは、列を一緒に追加してファイルをxlsxとして出力し、元のxlsxファイルを上書きすることです。各ファイルが上書きされるように、forloop内でそれを行いたいです。

これらすべてのデータフレームを 1 つの巨大なデータフレームにまとめて追加したくはありません。これは、ほとんどの人がこの質問をするときに最も一般的なクエリです。txtfile を xlsfile に追加して xlsx を書き込み、古い xlsx ファイルを上書きしたいだけです。必要に応じて繰り返します。

xls <- dir(pattern = "*xlsx")
txt <- dir(pattern = "*txt")
library("dplyr")
for (xlsfile in xls) {
  D1 <- read.xlsx(xlsfile, colNames = F, rowNames = F)
  for (txtfile in txt) {
    D2 <- read.csv(txtfile, header=F, skip=3, colClasses=c("NULL", rep(NA,3),
      rep("NULL",7), rep(NA,3), rep("NULL",6)
    # this is to keep columns for D2 in line with the columns in D1
    D3 <- bind_rows(D2, D1)
    write.xlsx(D3, xlsfile) 
  }
}

rbind_allD3 を作成するために使用しようとしましたが、Rrbind_allは非推奨であると言いましたか? 最初の問題は、xlsx パッケージを使用して D1 を読み取ると、X1-X6 のヘッダーが取得されることです。これは、短縮された txtfile に D2 の形式で含まれる列の数です。ただし、D2 のヘッダーは V2、V3、V4、V12、V13、V14 です。bind_rows を使用すると、X1-X6 列に追加する代わりに、V2-4 および V12-14 の形式で 6 つの新しい列ヘッダーが作成されるため、新しい情報は xlsx データフレームが終了する場所のすぐ下の 6 列になります。

2 つのファイルの文字が異なる要因であると言われましたが、それがなぜ、どのように起こったのかはわかりません。また、「V」から「X」への単純な gsub が機能するとは思えません。私のD2のV。

ただし、forloop は引き続き bind_rows で実行され、何らかの理由で、古いファイルを上書きして作成した xlsx ファイルを開くと、すべてのファイルに、最後の xlsfile+txtfile と同じ情報と要素が含まれています。 xlsx ファイルはまったく同じで、最後のファイル情報が含まれていましたが、余分な列が残っていました。これは私のコードとは別の問題だと思いますが、最後のファイルを何度も再作成しないようにする方法がわかりません。これは、ある時点で forloop が D3 作成のために終了するが、xlsx ファイルを書き込んでおらず、write.xlsx 関数をループする私のコードが原因であると思います。

R の xlsx (D1) ファイルは次のとおりです。

structure(list(X1 = c("#version 2.4", "## Oncotator v1.5.3.0 | Flat File 
Reference hg19 | GENCODE v19 EFFECT | UniProt_AAxform 2014_12 | CCLE_By_GP 
09292010 | UniProt_AA 2014_12 | dbNSFP v2.4 | ESP 6500SI-V2 | dbSNP build 
142 | 1000Genome phase3 20130502 | ClinVar 12.03.20 | COSMIC v62_291112 | 
ORegAnno UCSC Track | ESP 6500SI-V2 | UniProt 2014_12 | HumanDNARepairGenes 
20110905 | TUMORScape 20100104 | CCLE_By_Gene 09292010 | MutSig Published 
Results 20110905 | COSMIC_Tissue 291112 | CGC full_2012-03-15 | 
COSMIC_FusionGenes v62_291112 | HGNC Sept172014 | ACHILLES_Lineage_Results 
110303 | TCGAScape 110405 | Ensembl ICGC MUCOPA | Familial_Cancer_Genes 
20110905 | gencode_xref_refseq metadata_v19 ", 
"Hugo_Symbol", "MTOR", "EPHA2", "ARID1A"), X2 = c("P3F5", NA, 
"Chromosome", "1", "1", "1"), X3 = c(NA, NA, "Start_position", 
"11307916", "16464389", "27107273"), X4 = c(NA, NA, "End_position", 
"11307916", "16464389", "27107273"), X5 = c(NA, NA, 
"Variant_Classification", 
"Missense_Mutation", "Missense_Mutation", "3'UTR"), X6 = c(NA, 
NA, "Variant_Type", "SNP", "SNP", "SNP"), X7 = c(NA, NA, "Reference_Allele", 
"C", "C", "G"), X8 = c(NA, NA, "Tumor_Seq_Allele2", "T", "T", 
"C"), X9 = c(NA, NA, "dbSNP_RS", NA, NA, "rs199555039"), X10 = c(NA, 
NA, "dbSNP_Val_Status", NA, NA, "byFrequency"), X11 = c(NA, NA, 
"Genome_Change", "g.chr1:11307916C>T", "g.chr1:16464389C>T", 
"g.chr1:27107273G>C"), X12 = c(NA, NA, "Protein_Change", "p.S359N", 
"p.S424N", NA), X13 = c(NA, NA, "COSMIC_overlapping_mutations", 
NA, NA, NA)), .Names = c("X1", "X2", "X3", "X4", "X5", "X6", 
"X7", "X8", "X9", "X10", "X11", "X12", "X13"), row.names = c(NA, 
6L), class = "data.frame")

そして、D2に短縮されたtxtfileは

structure(list(V1 = structure(c(2L, 1L, 31L, 41L), .Label = c("## 
Oncotator v1.5.3.0 | Flat File Reference hg19 | GENCODE v19 EFFECT | 
UniProt_AAxform 2014_12 | CCLE_By_GP 09292010 | UniProt_AA 2014_12 | dbNSFP 
v2.4 | ESP 6500SI-V2 | dbSNP build 142 | 1000Genome phase3 20130502 | 
ClinVar 12.03.20 | COSMIC v62_291112 | ORegAnno UCSC Track | ESP 6500SI-V2 | 
UniProt 2014_12 | HumanDNARepairGenes 20110905 | TUMORScape 20100104 | 
CCLE_By_Gene 09292010 | MutSig Published Results 20110905 | COSMIC_Tissue 
291112 | CGC full_2012-03-15 | COSMIC_FusionGenes v62_291112 | HGNC 
Sept172014 | ACHILLES_Lineage_Results 110303 | TCGAScape 110405 | Ensembl 
ICGC MUCOPA | Familial_Cancer_Genes 20110905 | gencode_xref_refseq 
metadata_v19 ", 
"#version 2.4", "ALK", "APC", "ARID1A", "ATM", "BAP1", "BCOR", 
"C11orf30", "CDH1", "CDKN2B-AS1", "CHD1", "CNOT3", "CSF3R", "CTNNB1", 
"DOK6", "EP300", "EPCAM", "EPHA2", "EPHA7", "EPOR", "ERBB2", 
"ERCC2", "ETV6", "FANCA", "FGF6", "FOXO1", "GPR124", "HDAC4", 
"HNF1A", "Hugo_Symbol", "IKBKE", "KDM5C", "KDR", "KLF4", "KMT2B", 
"KMT2D", "LRP1B", "MAP2K4", "MET", "MTOR", "MYB", "NCOA3", "NF2", 
"NOTCH2", "NSD1", "NTRK2", "NTRK3", "NUTM1", "PARK2", "PAX8", 
"PBRM1", "PDGFRA", "PDGFRB", "PHF8", "PMS1", "PMS2", "POLE", 
"POT1", "PRKCA", "RAD51C", "RAF1", "RB1", "ROBO1", "RUNX1", "SLIT2", 
"SMC1A", "SYK", "SYNE1", "TCF7L2", "TP53", "Unknown", "USP7", 
"WHSC1", "ZMYM3"), class = "factor"), V5 = structure(c(1L, 1L, 
23L, 2L), .Label = c("", "1", "10", "11", "12", "13", "15", "16", 
"17", "18", "19", "2", "20", "21", "22", "3", "4", "5", "6", 
"7", "8", "9", "Chromosome", "X"), class = "factor"), V6 = structure(c(1L, 
1L, 99L, 6L), .Label = c("", "108121722", "110249856", "112174416", 
"112175711", "11307916", "113991570", "114849135", "11488524", 
"116418878", "11958126", "12038916", "120510314", "121426926", 
"123011945", "124481159", "12626167", "12641934", "12642536", 
"12650654", "133240692", "135537281", "141806602", "144003367", 
"149495558", "152646204", "161771035", "16464389", "172516645", 
"176636762", "190728482", "1918738", "20597970", "206666682", 
"22103913", "240003706", "27107273", "29432715", "29445154", 
"29541192", "30038169", "31526940", "34508048", "34646113", "34649335", 
"36176975", "36186234", "36193590", "36210950", "36937111", "37689156", 
"37884417", "39921415", "41162692", "41193074", "41217841", "41223629", 
"41266049", "41564531", "4554585", "45867603", "46264182", "47613623", 
"48955624", "49439908", "51012853", "52442493", "52501682", "52692305", 
"53225043", "53441750", "54008382", "54647625", "55133866", "55970836", 
"56774005", "6029518", "64685656", "64691239", "64695929", "64711567", 
"64723880", "67511099", "68863633", "70465919", "7588995", "76164351", 
"79067483", "87476563", "88663732", "89849429", "89851191", "9017149", 
"92561683", "93657891", "93969298", "98194682", "98209419", "Start_position"
), class = "factor"), V7 = structure(c(1L, 1L, 99L, 6L), .Label = c("", 
"108121722", "110249856", "112174416", "112175711", "11307916", 
"113991570", "114849135", "11488524", "116418878", "11958126", 
"12038916", "120510314", "121426926", "123011945", "124481159", 
"12626167", "12641934", "12642536", "12650654", "133240692", 
"135537281", "141806602", "144003367", "149495558", "152646204", 
"161771035", "16464389", "172516645", "176636762", "190728482", 
"1918738", "20597970", "206666682", "22103913", "240003706", 
"27107273", "29432715", "29445154", "29541192", "30038169", "31526940", 
"34508048", "34646113", "34649335", "36176975", "36186234", "36193590", 
"36210950", "36937111", "37689156", "37884417", "39921415", "41162692", 
"41193074", "41217841", "41223629", "41266049", "41564531", "4554585", 
"45867603", "46264182", "47613623", "48955624", "49439908", "51012853", 
"52442493", "52501682", "52692305", "53225043", "53441750", "54008382", 
"54647625", "55133866", "55970836", "56774005", "6029518", "64685656", 
"64691239", "64695929", "64711567", "64723880", "67511099", "68863633", 
"70465919", "7588995", "76164351", "79067483", "87476563", "88663732", 
"89849429", "89851191", "9017149", "92561683", "93657891", "93969298", 
"98194682", "98209419", "End_position"), class = "factor"), V9 = 
structure(c(1L, 
1L, 9L, 5L), .Label = c("", "3'UTR", "IGR", "Intron", "Missense_Mutation", 
"Nonsense_Mutation", "RNA", "Silent", "Variant_Classification"
), class = "factor"), V10 = structure(c(1L, 1L, 3L, 2L), .Label = c("", 
"SNP", "Variant_Type"), class = "factor"), V11 = structure(c(1L, 
1L, 5L, 3L), .Label = c("", "A", "C", "G", "Reference_Allele", 
"T"), class = "factor"), V13 = structure(c(1L, 1L, 6L, 5L), .Label = c("", 
"A", "C", "G", "T", "Tumor_Seq_Allele2"), class = "factor"), 
V14 = structure(c(1L, 1L, 2L, 1L), .Label = c("", "dbSNP_RS", 
"rs116809738", "rs139387758", "rs141802026", "rs150631563", 
"rs199555039", "rs372456536", "rs550515708", "rs567211112", 
"rs571512044", "rs587782826", "rs79547774"), class = "factor"), 
V15 = structure(c(1L, 1L, 3L, 1L), .Label = c("", "byFrequency", 
"dbSNP_Val_Status"), class = "factor"), V35 = structure(c(1L, 
1L, 99L, 2L), .Label = c("", "g.chr1:11307916C>T", "g.chr1:120510314G>A", 
"g.chr1:16464389C>T", "g.chr1:206666682C>A", "g.chr1:27107273G>C", 
"g.chr1:36937111G>T", "g.chr10:114849135G>A", "g.chr10:123011945G>A", 
"g.chr11:108121722C>T", "g.chr11:76164351T>A", "g.chr12:12038916A>G", 
"g.chr12:121426926A>C", "g.chr12:133240692C>T", "g.chr12:4554585G>A", 
"g.chr12:49439908G>A", "g.chr13:41162692A>G", "g.chr13:41193074G>A", 
"g.chr13:41217841T>C", "g.chr13:41223629G>T", "g.chr13:48955624C>T", 
"g.chr15:34646113G>A", "g.chr15:34649335T>C", "g.chr15:88663732G>A", 
"g.chr16:31526940C>T", "g.chr16:68863633T>A", "g.chr16:89849429G>A", 
"g.chr16:89851191C>T", "g.chr16:9017149C>T", "g.chr17:11958126G>A", 
"g.chr17:37884417C>T", "g.chr17:56774005C>T", "g.chr17:64685656C>T", 
"g.chr17:64691239G>A", "g.chr17:64695929A>T", "g.chr17:64711567C>T", 
"g.chr17:64723880G>A", "g.chr17:7588995C>T", "g.chr18:67511099A>G", 
"g.chr19:11488524C>T", "g.chr19:34508048G>A", "g.chr19:36210950A>G", 
"g.chr19:45867603A>G", "g.chr19:54647625C>T", "g.chr2:113991570C>T", 
"g.chr2:141806602C>T", "g.chr2:190728482G>A", "g.chr2:240003706G>A", 
"g.chr2:29432715G>A", "g.chr2:29445154C>T", "g.chr2:29541192G>T", 
"g.chr2:47613623A>C", "g.chr20:46264182A>C", "g.chr20:51012853G>A", 
"g.chr20:52501682G>A", "g.chr21:36176975G>A", "g.chr21:36186234T>G", 
"g.chr21:36193590A>G", "g.chr22:30038169T>C", "g.chr22:41564531C>T", 
"g.chr3:12626167G>A", "g.chr3:12641934C>T", "g.chr3:12642536G>T", 
"g.chr3:12650654C>T", "g.chr3:41266049C>G", "g.chr3:52442493G>A", 
"g.chr3:52692305G>A", "g.chr3:79067483T>C", "g.chr4:172516645G>A", 
"g.chr4:1918738G>A", "g.chr4:20597970G>A", "g.chr4:55133866T>C", 
"g.chr4:55970836C>T", "g.chr5:112174416G>T", "g.chr5:112175711G>A", 
"g.chr5:149495558A>G", "g.chr5:176636762T>C", "g.chr5:98194682A>G", 
"g.chr5:98209419G>A", "g.chr6:135537281C>T", "g.chr6:152646204G>A", 
"g.chr6:161771035C>T", "g.chr6:93969298C>G", "g.chr7:116418878T>A", 
"g.chr7:124481159G>T", "g.chr7:6029518C>T", "g.chr7:92561683G>A", 
"g.chr8:144003367A>G", "g.chr8:37689156T>A", "g.chr9:110249856C>T", 
"g.chr9:22103913G>A", "g.chr9:87476563G>A", "g.chr9:93657891G>A", 
"g.chrX:39921415G>A", "g.chrX:53225043G>A", "g.chrX:53441750T>C", 
"g.chrX:54008382T>C", "g.chrX:70465919G>A", "Genome_Change"
), class = "factor"), V42 = structure(c(1L, 1L, 40L, 34L), .Label = c("", 
"p.A1150V", "p.A1474T", "p.A301T", "p.A353T", "p.A403D", 
"p.A510A", "p.A624T", "p.C654Y", "p.D1014D", "p.D454D", "p.G581D", 
"p.H84H", "p.I1148N", "p.I185I", "p.K123R", "p.K234R", "p.K403K", 
"p.L360P", "p.L413I", "p.L518L", "p.L791H", "p.L868L", "p.M102I", 
"p.P16A", "p.P542Q", "p.P672P", "p.P868S", "p.Q1545*", "p.Q410P", 
"p.R1469W", "p.S1042I", "p.S1318L", "p.S359N", "p.S424N", 
"p.S51L", "p.T1258I", "p.T273T", "p.V1521A", "Protein_Change"
), class = "factor"), V57 = structure(c(1L, 1L, 2L, 1L), .Label = c("", 
"COSMIC_overlapping_mutations", "p.?(1)", "p.?(2)", "p.0?(10)|p.?(1)", 
"p.0?(15)|p.?(4)", "p.0?(2)|p.?(1)", "p.A5_A80del(53)|p.A5_Q143del(7)|p.A5_A80>D(7)|p.?(4)|p.T3_A126del(2)|p.A5fs*7(2)|p.M5_N141>D(2)|p.L10_N141del(2)|p.A5_Y142>D(2)|p.A5_Q143>E(1)|p.A13_R151del(1)|p.D6_S29del(1)|p.M14_S45del(1)|p.M1_A87del(1)|p.D11_Y142>H(1)|p.A5_T59del(1)|p.M1_V173del(1)|p.E15_I140>V(1)|p.A5_T40del(1)|p.A5_E54del(1)|p.M8_A80del(1)|p.P16_K133del(1)|p.A5_I35del(1)", 
"p.K1454fs*3(1)|p.K1192fs*3(1)|p.?(1)", "p.P542R(1)"), class = "factor")), .Names = c("V1", 
"V5", "V6", "V7", "V9", "V10", "V11", "V13", "V14", "V15", "V35", 
"V42", "V57"), row.names = c(NA, 4L), class = "data.frame")

3行目はすべて同じなので、両方を読むときに数行をスキップして簡単に追加できるように言うかもしれませんが、最初の2つのヘッダーを保持する必要がありますが、その方法は私だけが知っていますやっている

FileName = file.choose()
input = file(FileName, open = "r")
Header = readLines(input, 2)
Data <- read.csv(input)
close(input)

output = file(FileName, open = "w")
writeLines(Header, output)
write.csv(Data, output, row.names = F)
close(output)

そして、forloopでこれを行う方法がわかりません

4

0 に答える 0