# Extract human gene lengths from the Ensembl GRCh38 (release 112) GTF file.

# Install rtracklayer only when it is missing, and do so *before* loading it;
# installing after library() is too late for a fresh machine and re-installs
# the package on every run.
if (!requireNamespace("rtracklayer", quietly = TRUE)) {
  BiocManager::install("rtracklayer")
}
library(rtracklayer)

# Read every annotation record, then keep only the gene-level features.
gtf <- import("Homo_sapiens.GRCh38.112.gtf")
genes <- gtf[gtf$type == "gene"]

# Gene length is the width of each gene range (end - start + 1).
# NOTE(review): this is the genomic span, introns included — confirm that is
# the length definition wanted for the downstream TPM normalisation.
genes$gene_length <- width(genes)

# Export all annotation columns plus gene_length; row names are written as
# the first (unnamed) CSV column.
write.csv(as.data.frame(genes), "genes.csv")
# Join two gene tables on their shared "Gene" column and save the result.
# merge() with default settings keeps only genes present in both files.
b_genes <- read.csv("B.csv", header = TRUE)
brac_genes <- read.csv("BRACgene.csv", header = TRUE)

# The result is deliberately not called `c`, so that the base function c()
# is not masked by a data frame of the same name.
merged_genes <- merge(b_genes, brac_genes, by = "Gene")
write.csv(merged_genes, "MERGE.csv")
# Collapse version-suffixed gene IDs, attach gene lengths and convert the
# expression table to TPM.
library(dplyr)

# All inputs and outputs of this step are resolved relative to this folder.
setwd("D:/Downloads/google")

# The first column holds the gene identifier (column name "Gene").
expr_raw <- read.csv("BRAC1.csv", header = TRUE)
# Strip everything from the first "." onwards (e.g. a version suffix).
expr_raw[, 1] <- gsub("\\..*", "", expr_raw[, 1])

# Keep one row per gene: each value is the mean of all rows that shared the
# same gene name in the input table.
expr_mean <- expr_raw %>%
  group_by(Gene) %>%
  summarise(across(everything(), function(x) mean(x, na.rm = TRUE)))

# Load the human gene-length table.
gene_human <- read.csv("genes.csv", header = TRUE)
# A genes.csv exported straight from the GTF names the identifier gene_id.
if (!"Gene" %in% names(gene_human) && "gene_id" %in% names(gene_human)) {
  gene_human$Gene <- gene_human$gene_id
}
stopifnot("Gene" %in% names(gene_human))

# Choose the length column by name; for a hand-made table without a
# gene_length column, fall back to the first non-key column (the column the
# previous positional lookup used).
length_col <- if ("gene_length" %in% names(gene_human)) {
  "gene_length"
} else {
  setdiff(names(gene_human), "Gene")[1]
}
stopifnot(!is.na(length_col))

# One length per gene, de-duplicated *before* the merge so that the length
# vector and the expression rows can never get out of step.
gene_lengths <- gene_human[
  !duplicated(gene_human$Gene),
  c("Gene", length_col)
]

# merge() puts the key first and the remaining columns of the first table
# next, so column 1 is Gene and column 2 is the gene length.
expr_len <- merge(gene_lengths, expr_mean, by = "Gene")
# Written only once the merged table exists.
write.csv(expr_len, "3.csv")

gene_length_kb <- expr_len[[2]] / 1000

# Sample columns only, indexed by gene name.
expr_mat <- expr_len[, -(1:2), drop = FALSE]
row.names(expr_mat) <- expr_len$Gene

# Undo the log2 transform.
# NOTE(review): assumes the input is plain log2(x); if it is log2(x + 1) the
# inverse should be 2^x - 1 — confirm against the data source.
expr_mat <- 2^expr_mat

# RPK of every gene: expression divided by gene length in kilobases.
rpk <- expr_mat / gene_length_kb
# Total RPK of every sample (column).
total_rpk <- colSums(rpk, na.rm = TRUE)
# TPM of every gene: divide each sample column by its total RPK and multiply
# by 1,000,000.
tpm <- sweep(rpk, 2, total_rpk, "/") * 1e6

write.csv(tpm, "BRAC_TPM.CSV")

# Open the data viewers only in an interactive session, and only after the
# objects they display have been created.
if (interactive()) {
  View(total_rpk)
  View(tpm)
}