require(lattice)
require(fields)

#  
error <- function(pred, ref)
{
	e <- abs(pred-ref)
	e
}

error.rel <- function(pred, ref, sym=TRUE)
{
	e <- pred/ref
	if(sym & (e < 1)) e <- 1/e
	e
}


quantify.toppep.toptrans.prots.truncated.ranking <- function(lfqtab, prots, toppeps, toptrans, area=TRUE, log=FALSE, repl=0)
{
	qs <- as.vector(sapply(prots, function(prot) quantify.toppep.toptrans.truncated.ranking(lfqtab=lfqtab, prot=prot, toppeps=toppeps, toptrans=toptrans, area=area, log=log, repl=repl)))
	
	qs
}


quantify.toppep.toptrans.truncated.ranking <- function(lfqtab, prot, toppeps, toptrans, area=TRUE, log=FALSE, repl=0)
{

	# 	lfqtab is supposed to be a table with cols
	#  [1] "PrecursorMz"         "ProductMz"           "CollisionEnergy"    
	#  [4] "PeptideSequence"     "ProteinName"         "PrecursorCharge"    
	#  [7] "FragmentIon"         "ProductCharge"       "Area.Triplicate.1"  
	# [10] "Area.Triplicate.2"   "Area.Triplicate.3"   "Height.Triplocate.1"
	# [13] "Height.Triplocate.2" "Height.Triplocate.3"	
	
	
	q <- 0
	

	# get subtable with entries for protein prot
	prottab <- lfqtab[which(as.vector(lfqtab[,"ProteinName"]) == prot),]
	# check if protein prot is reported in lfqtab
	if(nrow(prottab) == 0) {print(paste("Did not find protein", prot, "in quantification table")); return(q)}
	
	
	# quantification cols start at 9 for area, or at 12 for peak height
	qcoloffset <- 8
	if(area == FALSE) qcoloffset <- qcoloffset + 3
	
	considered.repl.cols <- (qcoloffset+1):(qcoloffset+3)
	if(repl > 0 && repl < 4) considered.repl.cols <- qcoloffset+repl
		
	
	# peptides pointing to prot
	
		peps <- unique(as.vector(prottab[,"PeptideSequence"]))
# 		print(peps)
		
		# total peptide quantities -- estimated by the sum of the first topt transitions 
		pep.tot.quants <- sapply(peps, function(pep) {sum(head(sort(sapply(which(as.vector(prottab[,"PeptideSequence"]) == pep), function(prow) {mean(sapply(considered.repl.cols, function(qcol) prottab[prow,qcol]), na.rm=TRUE)}), decreasing=TRUE ), toptrans))})
		
		# sort in descending order
		peps.order <- order(pep.tot.quants, decreasing=TRUE)
		peps.sorted <- peps[peps.order]
		pep.tot.quants.sorted <- pep.tot.quants[peps.order]
		
		# considered peptides
		considered.peps <- head(peps.sorted, toppeps)
		considered.pep.tot.quants <- head(pep.tot.quants.sorted, toppeps)
		
		# determine considered transitions for considered peptides
		# determine labelfree quantitation estimates by peptide
		
		lfq <- sapply(considered.peps, function(pep) {sum(head(sort(sapply(which(as.vector(prottab[,"PeptideSequence"]) == pep), function(prow) {mean(sapply(considered.repl.cols, function(qcol) prottab[prow,qcol]), na.rm=TRUE)}), decreasing=TRUE), toptrans))})
		
		# determine labelfree estimate for protein 
		q <- sum(lfq)

	
	if(log==TRUE) q <- log(q)
	
	q

}


# model selection for label free quantification with srm
# parameters
#
# lfqfilename		filename for mrmprophet file with transition intensity information
# aquaqfilename		filename for reference (e.g. aqua) protein abundances
# area.flag			flag indicating whether transition intensities are estimated from area (or height otherwise)
# outprefix			prefix for output files
# bootstrap.size	number of repetitions of monte carlo cv
# bootstrap.subfrac relative size of training fold in each monte carlo cv repeat
lf_srm_quant <- function(lfqfilename, aquaqfilename, area.flag, outprefix, bootstrap.size=1000, bootstrap.subfrac=2/3)
{
	

print(paste("parsing", lfqfilename))

lfqtab.all.tmp <- read.table(lfqfilename, sep=",", header=T, na.strings="#N/A")
lfqtab.all.tmp[,"Protein.name"] <- factor(gsub(" ","",lfqtab.all.tmp[,"Protein.name"]))

lfqtab.formatted.tmp <- data.frame(PrecursorMz=numeric(0), ProductMz=numeric(0), CollisionEnergy=numeric(0), PeptideSequence=character(0), ProteinName=character(0), PrecursorCharge=numeric(0), FragmentIon=character(0), ProductCharge=numeric(0), Area.Triplicate.1=numeric(0), Area.Triplicate.2=numeric(0), Area.Triplicate.3=numeric(0), Height.Triplocate.1=numeric(0), Height.Triplocate.2=numeric(0), Height.Triplocate.3=numeric(0))


peps <- levels(lfqtab.all.tmp[,"transition_group_pepseq"])
precursor.charges <- unique(as.vector(lfqtab.all.tmp[,"transition_group_charge"]))
for(charge in precursor.charges)
{
	inds.charge <- which(as.vector(lfqtab.all.tmp[,"transition_group_charge"]) == charge)
	for(pep in peps)
	{
		inds.pep <- which(as.vector(lfqtab.all.tmp[,"transition_group_pepseq"]) == pep)
		inds <- intersect(inds.charge, inds.pep)
		
		if(length(inds) > 0)
		{
			if(length(inds) != 3) print("length(inds) != 3")
			
			iontypes <- unlist(strsplit(as.character(lfqtab.all.tmp[inds[1], "transition.fragment.ion"]),","))
			
			
			### restore original code
# 			for(ion.i in 1)
			for(ion.i in 1:length(iontypes))
			{
				areas <- c()
				heights <- c()
				for(i in inds)
				{	
					areas <- c(areas, as.numeric(unlist(strsplit(as.character(lfqtab.all.tmp[i, "area.per.transition"]),","))[ion.i]))
					
					heights <- c(heights, as.numeric(unlist(strsplit(as.character(lfqtab.all.tmp[i, "height.per.transition"]),","))[ion.i]))
				}
 				
				lfqtab.formatted.tmp <- rbind(lfqtab.formatted.tmp, data.frame(PrecursorMz=1, ProductMz=1, CollisionEnergy=1, PeptideSequence=pep, ProteinName=lfqtab.all.tmp[inds[1],"Protein.name"], PrecursorCharge=charge, FragmentIon=iontypes[ion.i], ProductCharge=1, Area.Triplicate.1=areas[1], Area.Triplicate.2=areas[2], Area.Triplicate.3=areas[3], Height.Triplocate.1=heights[1], Height.Triplocate.2=heights[2], Height.Triplocate.3=heights[3]))
		
			}
		}
	}
}

out.lfqfilename <- paste(lfqfilename, "_lff.csv", sep="")
write.table(lfqtab.formatted.tmp, file=out.lfqfilename, sep=",")

lfqtab <- lfqtab.formatted.tmp
	

print(paste("parsing", aquaqfilename))
	
aquaqtab <- read.table(aquaqfilename, sep=",", header=T, na.strings="#N/A")
aquaqtab[,"protein.name"] <- factor(gsub(" ","",aquaqtab[,"protein.name"]))

aquaq <- aquaqtab[,"abundance"]
aquaq.prots <- as.vector(aquaqtab[,"protein.name"])

	
area.tag <- "_peakheight"
if(area.flag == TRUE) area.tag <- "_peakarea"


maxt <- max(sapply(levels(lfqtab[,"PeptideSequence"]), function(pep) length(which(as.vector(lfqtab[,"PeptideSequence"]) == pep))))

maxp <- max(sapply(levels(lfqtab[,"ProteinName"]), function(prot) length(unique(as.vector(lfqtab[which(as.vector(lfqtab[,"ProteinName"]) == prot), "PeptideSequence"])) ) ) )  


print("generating quantification estimates for different models")

# outlier characterization: times std tolerance
outlier.std.tol <- 2.5


# row1: topp, row2: topt, row3: err
mean.errs <- matrix(0,0,3)
colnames(mean.errs) <- c("topp","topt","mean.err")

for(topp in 1:maxp)
# for(topp in 2)
{
for(topt in 1:maxt)
# for(topt in 4)
{

print(paste(topp, "top peptides,", topt, "top transitions"))

lfq <- quantify.toppep.toptrans.prots.truncated.ranking(lfqtab=lfqtab, prots=aquaq.prots, toppeps=topp, toptrans=topt, area=area.flag)

# global fit
q.global <- log(aquaq)
lf.global <- log(lfq)

lm.g <- lm(q.global ~ lf.global)
	
# remove outliers
outlier.iteration <- TRUE
outlier.i <- c()
inlier.i <- 1:length(aquaq)
while(outlier.iteration)
{
	lfq.gp <- predict(lm.g, data.frame(lf.global=lf.global))
	lfq.gq.std <- sqrt(var(lfq.gp-q.global))
	
	new.outlier.i <- which(abs(lfq.gp-q.global) > outlier.std.tol*lfq.gq.std)
	if(length(new.outlier.i) > 0)
	{
		
		print(paste("topp", topp, "topt", topt, ": removed", length(new.outlier.i), "outlier", "(", length(which(q.global-lfq.gp > outlier.std.tol*lfq.gq.std)), " down)"))
		outlier.i <- c(outlier.i, inlier.i[new.outlier.i])
		inlier.i <- inlier.i[which(abs(lfq.gp-q.global) <= outlier.std.tol*lfq.gq.std)]
		
		q.global <- log(aquaq)[inlier.i]
		lf.global <- log(lfq)[inlier.i]
		lm.g <- lm(q.global ~ lf.global)
	}
	if(length(new.outlier.i) == 0) outlier.iteration <- FALSE
	
}


print("plotting training fits")
pdf(paste(outprefix, "_linear_fit_topp", topp, "_topt", topt, area.tag, ".pdf", sep=""), width=14, height=7)
lineseq <- seq(min(lf.global),max(lf.global), length.out=200)
par(mfrow=c(1,2))
plot(exp(lf.global), exp(q.global), ylim=c(min(exp(q.global)),max(exp(q.global))), xlim=c(min(exp(lf.global)),max(exp(lf.global))), xlab="label free intensity", ylab="aqua copies/cell", main=paste("Linear Fit TopPep ", topp, ", TopTra ", topt, sep=""))
par(new=T)
plot(exp(lineseq), exp(predict(lm.g, data.frame(lf.global=lineseq))), ylim=c(min(exp(q.global)),max(exp(q.global))), xlim=c(min(exp(lf.global)),max(exp(lf.global))), xlab="", ylab="", type="l")

plot(lf.global, q.global, ylim=c(min((q.global)),max((q.global))), xlim=c(min((lf.global)),max((lf.global))), xlab="label free log(intensity)", ylab="aqua log(copies/cell)")
par(new=T)
plot(lineseq, predict(lm.g, data.frame(lf.global=lineseq)), ylim=c(min((q.global)),max((q.global))), xlim=c(min((lf.global)),max((lf.global))), xlab="", ylab="", type="l")
# ex.cs1 <- expression(plain(sin) * phi,  paste("cos", phi))
rsq <- as.character(signif(summary(lm.g)$r.squared,3))
legend("topleft", c(paste("y = ", signif(lm.g$coefficients[2],3), "x", signif(lm.g$coefficients[1],3), sep=""), paste("r^2  = ", rsq, sep=""), paste("r = ", signif(sqrt(summary(lm.g)$r.squared),3), sep="")) )
par(mfrow=c(1,1))
dev.off()


print("Monte Carlo cross validation")
errs <- c()
train.errs <- c()
for(iter in 1:bootstrap.size)
{
	all.ind <- 1:length(aquaq)
	if(length(outlier.i) > 0) all.ind <- (1:length(aquaq))[-outlier.i]
	
	train.ind <- sample(all.ind, as.integer(bootstrap.subfrac*length(all.ind)))
	test.ind <- setdiff(all.ind,train.ind)

	# fit log of quantities (since quantities are > 0)
	q.train <- log(aquaq[train.ind])
	lf.train <- log(lfq[train.ind])
	lm.q <- lm(q.train ~ lf.train)
	
		
	# test and expand log quantities 
	lf.test <- exp(predict(lm.q, newdata=data.frame(lf.train=log(lfq[test.ind]))))
	q.test <- aquaq[test.ind]
	
	errs <- c(errs, sapply(1:length(lf.test), function(i) error.rel(pred=lf.test[i],ref=q.test[i])))
	
}


mean.errs <- rbind(mean.errs, c(topp, topt, mean(errs)))
print(paste("mean fold error:", mean(errs)))
# flush current results
write.table(mean.errs, file=paste(outprefix, "_mean_errs_all_topp_topt", area.tag, ".dat", sep=""), sep = "\t")

pdf(paste(outprefix, "_relative_errs_topp", topp, "_topt", topt, area.tag, ".pdf", sep=""))
hist(errs, main=paste("Relative Errors TopPep ", topp, ", TopTra ", topt, " (mean = ", signif(mean(errs),3), ", std err = ", signif(sqrt(var(errs))/length(errs),3), ")", sep=""), nclass=floor(bootstrap.size/30), xlab="relative error")
abline(v=mean(errs), col="red", lwd=3)
dev.off()

write.table(errs, file=paste(outprefix, "_relative_errs_topp", topp, "_topt", topt, area.tag, ".tsv", sep=""), sep = "\t")


}

}

print("plot summary results")

prefix = paste(outprefix, "_mean_errs_all_topp_topt", area.tag, sep="")
result.t <- read.table(paste(prefix, ".dat", sep=""), header=TRUE)

trans.count <- max(as.vector(result.t[,"topt"]))
pep.count <- max(as.vector(result.t[,"topp"]))
err.t <- matrix(data=0, nrow=trans.count, ncol=pep.count)
rownames(err.t) <- 1:trans.count
colnames(err.t) <- 1:pep.count

max.err <- max(as.vector(result.t[,"mean.err"]))
	
for(tc in 1:trans.count)
{
for(pc in 1:pep.count)
{
	ir <- intersect(which(result.t[,"topt"] == tc), which(result.t[,"topp"] == pc))  [1]

	if(length(ir) != 0) err.t[tc,pc] <- result.t[ir,"mean.err"]
	if(length(ir) == 0) err.t[tc,pc] <- max.err
}
}


pdf(paste(prefix, "_heatplot.pdf", sep=""))
plot(levelplot(err.t, xlab="transition count", ylab="peptide count", col.regions=c(tim.colors(2048)[1025:2048])))
dev.off()


}