require(lattice) require(fields) # error <- function(pred, ref) { e <- abs(pred-ref) e } error.rel <- function(pred, ref, sym=TRUE) { e <- pred/ref if(sym & (e < 1)) e <- 1/e e } quantify.toppep.toptrans.prots.truncated.ranking <- function(lfqtab, prots, toppeps, toptrans, area=TRUE, log=FALSE, repl=0) { qs <- as.vector(sapply(prots, function(prot) quantify.toppep.toptrans.truncated.ranking(lfqtab=lfqtab, prot=prot, toppeps=toppeps, toptrans=toptrans, area=area, log=log, repl=repl))) qs } quantify.toppep.toptrans.truncated.ranking <- function(lfqtab, prot, toppeps, toptrans, area=TRUE, log=FALSE, repl=0) { # lfqtab is supposed to be a table with cols # [1] "PrecursorMz" "ProductMz" "CollisionEnergy" # [4] "PeptideSequence" "ProteinName" "PrecursorCharge" # [7] "FragmentIon" "ProductCharge" "Area.Triplicate.1" # [10] "Area.Triplicate.2" "Area.Triplicate.3" "Height.Triplocate.1" # [13] "Height.Triplocate.2" "Height.Triplocate.3" q <- 0 # get subtable with entries for protein prot prottab <- lfqtab[which(as.vector(lfqtab[,"ProteinName"]) == prot),] # check if protein prot is reported in lfqtab if(nrow(prottab) == 0) {print(paste("Did not find protein", prot, "in quantification table")); return(q)} # quantification cols start at 9 for area, or at 12 for peak height qcoloffset <- 8 if(area == FALSE) qcoloffset <- qcoloffset + 3 considered.repl.cols <- (qcoloffset+1):(qcoloffset+3) if(repl > 0 && repl < 4) considered.repl.cols <- qcoloffset+repl # peptides pointing to prot peps <- unique(as.vector(prottab[,"PeptideSequence"])) # print(peps) # total peptide quantities -- estimated by the sum of the first topt transitions pep.tot.quants <- sapply(peps, function(pep) {sum(head(sort(sapply(which(as.vector(prottab[,"PeptideSequence"]) == pep), function(prow) {mean(sapply(considered.repl.cols, function(qcol) prottab[prow,qcol]), na.rm=TRUE)}), decreasing=TRUE ), toptrans))}) # sort in descending order peps.order <- order(pep.tot.quants, decreasing=TRUE) peps.sorted <- peps[peps.order] pep.tot.quants.sorted <- pep.tot.quants[peps.order] # considered peptides considered.peps <- head(peps.sorted, toppeps) considered.pep.tot.quants <- head(pep.tot.quants.sorted, toppeps) # determine considered transitions for considered peptides # determine labelfree quantitation estimates by peptide lfq <- sapply(considered.peps, function(pep) {sum(head(sort(sapply(which(as.vector(prottab[,"PeptideSequence"]) == pep), function(prow) {mean(sapply(considered.repl.cols, function(qcol) prottab[prow,qcol]), na.rm=TRUE)}), decreasing=TRUE), toptrans))}) # determine labelfree estimate for protein q <- sum(lfq) if(log==TRUE) q <- log(q) q } # model selection for label free quantification with srm # parameters # # lfqfilename filename for mrmprophet file with transition intensity information # aquaqfilename filename for reference (e.g. aqua) protein abundances # area.flag flag indicating whether transition intensities are estimated from area (or height otherwise) # outprefix prefix for output files # bootstrap.size number of repetitions of monte carlo cv # bootstrap.subfrac relative size of training fold in each monte carlo cv repeat lf_srm_quant <- function(lfqfilename, aquaqfilename, area.flag, outprefix, bootstrap.size=1000, bootstrap.subfrac=2/3) { print(paste("parsing", lfqfilename)) lfqtab.all.tmp <- read.table(lfqfilename, sep=",", header=T, na.strings="#N/A") lfqtab.all.tmp[,"Protein.name"] <- factor(gsub(" ","",lfqtab.all.tmp[,"Protein.name"])) lfqtab.formatted.tmp <- data.frame(PrecursorMz=numeric(0), ProductMz=numeric(0), CollisionEnergy=numeric(0), PeptideSequence=character(0), ProteinName=character(0), PrecursorCharge=numeric(0), FragmentIon=character(0), ProductCharge=numeric(0), Area.Triplicate.1=numeric(0), Area.Triplicate.2=numeric(0), Area.Triplicate.3=numeric(0), Height.Triplocate.1=numeric(0), Height.Triplocate.2=numeric(0), Height.Triplocate.3=numeric(0)) peps <- levels(lfqtab.all.tmp[,"transition_group_pepseq"]) precursor.charges <- unique(as.vector(lfqtab.all.tmp[,"transition_group_charge"])) for(charge in precursor.charges) { inds.charge <- which(as.vector(lfqtab.all.tmp[,"transition_group_charge"]) == charge) for(pep in peps) { inds.pep <- which(as.vector(lfqtab.all.tmp[,"transition_group_pepseq"]) == pep) inds <- intersect(inds.charge, inds.pep) if(length(inds) > 0) { if(length(inds) != 3) print("length(inds) != 3") iontypes <- unlist(strsplit(as.character(lfqtab.all.tmp[inds[1], "transition.fragment.ion"]),",")) ### restore original code # for(ion.i in 1) for(ion.i in 1:length(iontypes)) { areas <- c() heights <- c() for(i in inds) { areas <- c(areas, as.numeric(unlist(strsplit(as.character(lfqtab.all.tmp[i, "area.per.transition"]),","))[ion.i])) heights <- c(heights, as.numeric(unlist(strsplit(as.character(lfqtab.all.tmp[i, "height.per.transition"]),","))[ion.i])) } lfqtab.formatted.tmp <- rbind(lfqtab.formatted.tmp, data.frame(PrecursorMz=1, ProductMz=1, CollisionEnergy=1, PeptideSequence=pep, ProteinName=lfqtab.all.tmp[inds[1],"Protein.name"], PrecursorCharge=charge, FragmentIon=iontypes[ion.i], ProductCharge=1, Area.Triplicate.1=areas[1], Area.Triplicate.2=areas[2], Area.Triplicate.3=areas[3], Height.Triplocate.1=heights[1], Height.Triplocate.2=heights[2], Height.Triplocate.3=heights[3])) } } } } out.lfqfilename <- paste(lfqfilename, "_lff.csv", sep="") write.table(lfqtab.formatted.tmp, file=out.lfqfilename, sep=",") lfqtab <- lfqtab.formatted.tmp print(paste("parsing", aquaqfilename)) aquaqtab <- read.table(aquaqfilename, sep=",", header=T, na.strings="#N/A") aquaqtab[,"protein.name"] <- factor(gsub(" ","",aquaqtab[,"protein.name"])) aquaq <- aquaqtab[,"abundance"] aquaq.prots <- as.vector(aquaqtab[,"protein.name"]) area.tag <- "_peakheight" if(area.flag == TRUE) area.tag <- "_peakarea" maxt <- max(sapply(levels(lfqtab[,"PeptideSequence"]), function(pep) length(which(as.vector(lfqtab[,"PeptideSequence"]) == pep)))) maxp <- max(sapply(levels(lfqtab[,"ProteinName"]), function(prot) length(unique(as.vector(lfqtab[which(as.vector(lfqtab[,"ProteinName"]) == prot), "PeptideSequence"])) ) ) ) print("generating quantification estimates for different models") # outlier characterization: times std tolerance outlier.std.tol <- 2.5 # row1: topp, row2: topt, row3: err mean.errs <- matrix(0,0,3) colnames(mean.errs) <- c("topp","topt","mean.err") for(topp in 1:maxp) # for(topp in 2) { for(topt in 1:maxt) # for(topt in 4) { print(paste(topp, "top peptides,", topt, "top transitions")) lfq <- quantify.toppep.toptrans.prots.truncated.ranking(lfqtab=lfqtab, prots=aquaq.prots, toppeps=topp, toptrans=topt, area=area.flag) # global fit q.global <- log(aquaq) lf.global <- log(lfq) lm.g <- lm(q.global ~ lf.global) # remove outliers outlier.iteration <- TRUE outlier.i <- c() inlier.i <- 1:length(aquaq) while(outlier.iteration) { lfq.gp <- predict(lm.g, data.frame(lf.global=lf.global)) lfq.gq.std <- sqrt(var(lfq.gp-q.global)) new.outlier.i <- which(abs(lfq.gp-q.global) > outlier.std.tol*lfq.gq.std) if(length(new.outlier.i) > 0) { print(paste("topp", topp, "topt", topt, ": removed", length(new.outlier.i), "outlier", "(", length(which(q.global-lfq.gp > outlier.std.tol*lfq.gq.std)), " down)")) outlier.i <- c(outlier.i, inlier.i[new.outlier.i]) inlier.i <- inlier.i[which(abs(lfq.gp-q.global) <= outlier.std.tol*lfq.gq.std)] q.global <- log(aquaq)[inlier.i] lf.global <- log(lfq)[inlier.i] lm.g <- lm(q.global ~ lf.global) } if(length(new.outlier.i) == 0) outlier.iteration <- FALSE } print("plotting training fits") pdf(paste(outprefix, "_linear_fit_topp", topp, "_topt", topt, area.tag, ".pdf", sep=""), width=14, height=7) lineseq <- seq(min(lf.global),max(lf.global), length.out=200) par(mfrow=c(1,2)) plot(exp(lf.global), exp(q.global), ylim=c(min(exp(q.global)),max(exp(q.global))), xlim=c(min(exp(lf.global)),max(exp(lf.global))), xlab="label free intensity", ylab="aqua copies/cell", main=paste("Linear Fit TopPep ", topp, ", TopTra ", topt, sep="")) par(new=T) plot(exp(lineseq), exp(predict(lm.g, data.frame(lf.global=lineseq))), ylim=c(min(exp(q.global)),max(exp(q.global))), xlim=c(min(exp(lf.global)),max(exp(lf.global))), xlab="", ylab="", type="l") plot(lf.global, q.global, ylim=c(min((q.global)),max((q.global))), xlim=c(min((lf.global)),max((lf.global))), xlab="label free log(intensity)", ylab="aqua log(copies/cell)") par(new=T) plot(lineseq, predict(lm.g, data.frame(lf.global=lineseq)), ylim=c(min((q.global)),max((q.global))), xlim=c(min((lf.global)),max((lf.global))), xlab="", ylab="", type="l") # ex.cs1 <- expression(plain(sin) * phi, paste("cos", phi)) rsq <- as.character(signif(summary(lm.g)$r.squared,3)) legend("topleft", c(paste("y = ", signif(lm.g$coefficients[2],3), "x", signif(lm.g$coefficients[1],3), sep=""), paste("r^2 = ", rsq, sep=""), paste("r = ", signif(sqrt(summary(lm.g)$r.squared),3), sep="")) ) par(mfrow=c(1,1)) dev.off() print("Monte Carlo cross validation") errs <- c() train.errs <- c() for(iter in 1:bootstrap.size) { all.ind <- 1:length(aquaq) if(length(outlier.i) > 0) all.ind <- (1:length(aquaq))[-outlier.i] train.ind <- sample(all.ind, as.integer(bootstrap.subfrac*length(all.ind))) test.ind <- setdiff(all.ind,train.ind) # fit log of quantities (since quantities are > 0) q.train <- log(aquaq[train.ind]) lf.train <- log(lfq[train.ind]) lm.q <- lm(q.train ~ lf.train) # test and expand log quantities lf.test <- exp(predict(lm.q, newdata=data.frame(lf.train=log(lfq[test.ind])))) q.test <- aquaq[test.ind] errs <- c(errs, sapply(1:length(lf.test), function(i) error.rel(pred=lf.test[i],ref=q.test[i]))) } mean.errs <- rbind(mean.errs, c(topp, topt, mean(errs))) print(paste("mean fold error:", mean(errs))) # flush current results write.table(mean.errs, file=paste(outprefix, "_mean_errs_all_topp_topt", area.tag, ".dat", sep=""), sep = "\t") pdf(paste(outprefix, "_relative_errs_topp", topp, "_topt", topt, area.tag, ".pdf", sep="")) hist(errs, main=paste("Relative Errors TopPep ", topp, ", TopTra ", topt, " (mean = ", signif(mean(errs),3), ", std err = ", signif(sqrt(var(errs))/length(errs),3), ")", sep=""), nclass=floor(bootstrap.size/30), xlab="relative error") abline(v=mean(errs), col="red", lwd=3) dev.off() write.table(errs, file=paste(outprefix, "_relative_errs_topp", topp, "_topt", topt, area.tag, ".tsv", sep=""), sep = "\t") } } print("plot summary results") prefix = paste(outprefix, "_mean_errs_all_topp_topt", area.tag, sep="") result.t <- read.table(paste(prefix, ".dat", sep=""), header=TRUE) trans.count <- max(as.vector(result.t[,"topt"])) pep.count <- max(as.vector(result.t[,"topp"])) err.t <- matrix(data=0, nrow=trans.count, ncol=pep.count) rownames(err.t) <- 1:trans.count colnames(err.t) <- 1:pep.count max.err <- max(as.vector(result.t[,"mean.err"])) for(tc in 1:trans.count) { for(pc in 1:pep.count) { ir <- intersect(which(result.t[,"topt"] == tc), which(result.t[,"topp"] == pc)) [1] if(length(ir) != 0) err.t[tc,pc] <- result.t[ir,"mean.err"] if(length(ir) == 0) err.t[tc,pc] <- max.err } } pdf(paste(prefix, "_heatplot.pdf", sep="")) plot(levelplot(err.t, xlab="transition count", ylab="peptide count", col.regions=c(tim.colors(2048)[1025:2048]))) dev.off() }