#' Read tabulated text files (eg 'proteinGroups.txt') exported from MaxQuant
#'
#' Quantification results from \href{https://www.maxquant.org/}{MaxQuant} can be read using this function and relevant information extracted. 
#' The final output is a list containing these elements: \code{$raw}, \code{$quant}, \code{$annot}, \code{$counts}, \code{$quantNotes}, \code{$notes}, or (if \code{separateAnnot=FALSE}) data.frame 
#'   with annotation- and main quantification-content.
#' @details
#' This function has been developed using MaxQuant versions 1.6.10.x to 1.6.17.x, the format of resulting file 'proteinGroups.txt' is typically well conserved.  
#' 
#' @param path (character) path of file to be read
#' @param fileName (character) name of file to be read (default 'proteinGroups.txt' as typically generated by MaxQuant in txt folder) 
#' @param normalizeMeth (character) normalization method (will be sent to  \code{\link[wrMisc]{normalizeThis}}) 
#' @param quantCol (character or integer) exact col-names, or if length=1 content of \code{quantCol} will be used as pattern to search among column-names for $quant using \code{grep} 
#' @param contamCol (character or integer, length=1) which column should be used for contaminants as marked by MaxQuant
#' @param pepCountCol (character) pattern to search among column-names for count data of PSM and NoOfPeptides
#' @param uniqPepPat (character, length=1) deprecated, please use \code{pepCountCol} instead
#' @param extrColNames (character) column names to be read (1: column name for protein-IDs, default 'Majority.protein.IDs'; 2: column name of fasta-headers, default 'Fasta.headers'; 3: column name for number of protein IDs matching, default 'Number.of.proteins')
#' @param specPref (character) prefix to identifiers allowing to separate i) recognize contamination database, ii) species of main identifications and iii) spike-in species
#' @param refLi (character or integer) custom specify which line of data is main species, if character (eg 'mainSpe'), the column 'SpecType' in $annot will be searched for exact match of the (single) term given
#' @param remRev (logical) option to remove all protein-identifications based on reverse-peptides
#' @param separateAnnot (logical) if \code{TRUE} output will be organized as list with \code{$annot}, \code{$raw} for initial/raw abundance values and \code{$quant} with final normalized quantitations
#' @param plotGraph (logical) optional plot vioplot of initial and normalized data (using \code{normalizeMeth}); alternatively the argument may contain numeric details that will be passed to \code{layout} when plotting
#' @param tit (character) custom title to plot
#' @param wex (numeric)  relative expansion factor of the violin in plot
#' @param silent (logical) suppress messages
#' @param callFrom (character) allow easier tracking of message produced
#' @return list with  \code{$raw} (initial/raw abundance values), \code{$quant} with final normalized quantitations, \code{$annot} (columns ), \code{$counts} an array with 'PSM' and 'NoOfRazorPeptides', \code{$quantNotes} and \code{$notes}; or a data.frame with quantitation and annotation if \code{separateAnnot=FALSE}
#' @seealso \code{\link[utils]{read.table}}, \code{\link[wrMisc]{normalizeThis}}, \code{\link{readProlineFile}} 
#' @examples
#' path1 <- system.file("extdata",package="wrProteo")
#' # Here we'll load a short/trimmed example file (thus not MaxQuant default name) 
#' fiNa <- "proteinGroupsMaxQuantUps1.txt"
#' specPref1 <- c(conta="conta|CON_|LYSC_CHICK", mainSpecies="YEAST",spike="HUMAN_UPS")
#' dataMQ <- readMaxQuantFile(path1, file=fiNa, specPref=specPref1)
#' summary(dataMQ$quant)
#' matrixNAinspect(dataMQ$quant, gr=gl(3,3)) 
#' @export
readMaxQuantFile <- function(path,fileName="proteinGroups.txt",normalizeMeth="median", quantCol="LFQ.intensity",contamCol="Potential.contaminant",
  pepCountCol=c("Peptides.","Razor...unique.peptides."), uniqPepPat=NULL, refLi=NULL,          
  extrColNames=c("Majority.protein.IDs","Fasta.headers","Number.of.proteins"), specPref=c(conta="conta|CON_|LYSC_CHICK",mainSpecies="OS=Homo sapiens"),
  remRev=TRUE, separateAnnot=TRUE, tit=NULL, wex=1.6, plotGraph=TRUE, silent=FALSE, callFrom=NULL) {
  ## Read a MaxQuant 'proteinGroups.txt'-type file, extract annotation, abundance and peptide-count columns,
  ## log2-transform and normalize the abundances and (optionally) plot the distributions before/after normalization.
  ## Returns a list (raw, quant, annot, counts, quantNotes, notes) or, if separateAnnot=FALSE, a data.frame.
  ## prepare
  fxNa <- wrMisc::.composeCallName(callFrom, newNa="readMaxQuantFile")
  ## honour the user's choice for 'remRev'; anything not interpretable as logical falls back to the default (TRUE)
  remRev <- if(length(remRev) >0 && !is.na(as.logical(remRev[1]))) as.logical(remRev[1]) else TRUE
  if(!file.exists(path)) stop(" Can't find path '",path,"'")
  chFi <- file.exists(file.path(path,fileName))
  if(!chFi) stop(" file  ",fileName," was NOT found in path ",path," !")
  if(!any(grepl("\\.txt$",fileName), grepl("\\.txt.gz$",fileName))) message(fxNa," Suspicious filename, this function was designed for reading tabulated text files produced by MaxQuant")
  chPa <- try(find.package("utils"),silent=TRUE)
  if("try-error" %in% class(chPa)) stop("package 'utils' not found ! Please install first")   
  ## initial read MaxQuant
  tmp <- utils::read.delim(file.path(path,fileName), stringsAsFactors=FALSE)
  if(length(uniqPepPat) >0) message(fxNa," NOTE: argument 'uniqPepPat' is deprecated (and its content ignored), please use 'pepCountCol' instead !")

  ## check which columns can be extracted (for annotation)
  if(is.numeric(contamCol)) contamCol <- colnames(tmp)[as.integer(contamCol)]      # column index (integer or double) -> column name
  extrColNames <- union(extrColNames, contamCol)                     # add contamCol if not included in extrColNames
  chCol <- extrColNames %in% colnames(tmp)
  if(!any(chCol)) { extrColNames <- gsub("\\."," ",extrColNames)     # 2nd try : names with spaces instead of '.'
    chCol <- extrColNames %in% colnames(tmp) }
  if(all(!chCol)) stop(" Problem locating annotation columns (",wrMisc::pasteC(extrColNames,quoteC="''"),")")
  if(any(!chCol) ) {
    if(!silent) message(fxNa," can't find columns ",wrMisc::pasteC(extrColNames[!chCol],quoteC="'"))
  }  

  ## 'REVERSE' peptides
  chMajProCol <- extrColNames[1] %in% colnames(tmp)
  if(chMajProCol) {
    chRev <- grep("REV__",tmp[,extrColNames[1]])
    if(length(chRev) >0) {
      if(!silent) message(fxNa," Note: Found ",length(chRev)," proteins marked as 'REV_' (reverse peptide identification)", if(remRev) " - Removing")
      if(remRev) tmp <- tmp[-1*chRev,]  
    }}

  ## further extracting : quantitation
  if(length(quantCol) <1) stop(" 'quantCol' must be provided !")
  quantColP <- NULL                           # initialize (remains NULL if exact column names were given)
  chP <- FALSE
  if(length(quantCol) >1) {                   # multiple entries : treat as exact column names
    abund <- as.matrix(wrMisc::extrColsDeX(tmp, extrCol=quantCol, doExtractCols=TRUE, callFrom=fxNa))
  } else {                                    # single entry : use as search pattern (add '.' as separator if not yet present)
    chP <- substr(quantCol, nchar(quantCol), nchar(quantCol)) != "."
    quantColP <- quantCol
    quantCol <- if(chP) grep(paste0(quantCol,"\\."), colnames(tmp)) else grep(quantCol,colnames(tmp)) 
    if(length(quantCol) <1) stop("no columns matching terms from 'quantCol' found")
    abund <- as.matrix(tmp[,quantCol,drop=FALSE]) }           # abundance val
  if(!is.numeric(abund)) {                    # text content : try converting to numeric, keep matrix structure
    abundDimNa <- dimnames(abund)
    abund <- apply(abund, 2, wrMisc::convToNum, convert="allChar", callFrom=fxNa)
    if(length(dim(abund)) <2) abund <- matrix(as.numeric(abund), ncol=length(abundDimNa[[2]]), dimnames=abundDimNa) }
  colnames(abund) <- if(length(quantColP)==1) sub(if(chP) paste0(quantColP,"\\.") else quantColP, "", colnames(abund)) else wrMisc::.trimFromStart(wrMisc::.trimFromEnd(colnames(abund)))
  ## convert 0 to NA
  abund[which(abund <= 0)] <- NA

  ## further extracting : annotation
  if(length(grep("\\\\",pepCountCol)) <1) pepCountCol <- gsub("\\.","\\\\.",pepCountCol)       # protect '.'  "
  useACol <- list(annC=match(extrColNames,colnames(tmp)), razC=if(length(pepCountCol) >0) grep(pepCountCol[1],colnames(tmp)), psm=if(length(pepCountCol) >1) grep(pepCountCol[2],colnames(tmp)))
  useACol <- lapply(useACol,wrMisc::naOmit)
  ## make array of PSM counts etc (only sets of count-columns matching the number of quantitation columns can be stored)
  nCou <- vapply(useACol[2:3], length, integer(1))
  usC <- nCou >0 & nCou == ncol(abund)
  if(any(nCou >0 & !usC) && !silent) message(fxNa," Number of peptide-count columns does not match number of quantitation columns, ignoring these counts")
  if(any(usC)) {
    counts <- array(dim=c(nrow(tmp),ncol(abund),sum(usC)), dimnames=list(NULL,colnames(abund),c("PSM","NoOfRazorPeptides")[which(usC)]))
    for(i in seq_len(sum(usC))) counts[,,i] <- as.numeric(as.matrix(tmp[,useACol[[1+ which(usC)[i]]] ]))
  } else counts <- NULL
  MQann <- as.matrix(tmp[,useACol$annC,drop=FALSE])  
  ## columns used for parsing identifiers and fasta-headers : MaxQuant default names if present, otherwise 1st and 2nd of 'extrColNames'
  idCol <- if("Majority.protein.IDs" %in% colnames(MQann)) "Majority.protein.IDs" else extrColNames[1]
  fastaCol <- if("Fasta.headers" %in% colnames(MQann)) "Fasta.headers" else extrColNames[2]
  chReq <- unique(c(idCol, fastaCol, extrColNames[2]))
  if(!all(chReq %in% colnames(MQann))) stop(" Can't find required annotation column(s) ",paste(chReq[!(chReq %in% colnames(MQann))],collapse=", ")," !")

  .extrAnno <- function(spIn=2,spc=specPref,anno=MQann,exCoNa=extrColNames,extrCoIn=2) {
    ## extract specific annotation terms out of matrix  anno
    ## 'spc' .. search-term(s), 'spIn' .. which element of 'spc' to use; returns index of matching lines of 'anno'
    if(length(spc) >= spIn) {
      if(spIn ==1){
        union(grep(spc[spIn], anno[,exCoNa[extrCoIn]]), which(gsub(" ","",anno[,exCoNa[extrCoIn]])=="+"))
      } else {
        ch1 <- unlist(spc[spIn])  
        if(length(ch1) >1) unique(unlist(lapply(ch1, function(x) grep(x,anno[,exCoNa[extrCoIn]])))) else grep(spc[spIn], anno[,exCoNa[extrCoIn]]) 
      }
    } else NULL }
  specMQ0 <- lapply(seq_along(specPref),.extrAnno)
  names(specMQ0) <- c("conta","mainSpe","species2","species3","species4")[seq_along(specMQ0)]  

  if(length(specMQ0$mainSpe) <1) {
    ## setdiff() also works when nothing at all was assigned (negative indexing with an empty index would select nothing)
    specMQ0$mainSpe <- setdiff(seq_len(nrow(tmp)), unlist(specMQ0))
    if(!silent) message(fxNa," Problem with 'mainSpecies' (none found), assuming that all ",length(specMQ0$mainSpe)," non-assigned proteins are 'mainSpecies' ")
  }   
  specMQ <- rep(NA,nrow(abund))
  for(i in seq_along(specMQ0)) if(length(specMQ0[[i]]) >0) specMQ[specMQ0[[i]]] <- names(specMQ0)[i]
  if(!silent) { chSp <- sum(is.na(specMQ))
    if(chSp >0) message(fxNa," Note: ",chSp," proteins with unknown species")
    tab <- table(specMQ)
    tab <- rbind(names(tab),": ",tab,"  ")
    message("   by species : ",apply(tab,2,paste)) }             # all lines assigned   
  MQann <- cbind(SpecType=specMQ,MQann)                                          # better to name column 'species' ?? 
   
  ## split Annotation
  remHeader <- c("^conta\\|","^sp\\|") 
  MQan2 <- strsplit(sub(remHeader[1],"",sub(remHeader[2],"", MQann[,idCol])),"\\|" )
  MQanLe <- vapply(MQan2, length, integer(1))
  MQan3 <- matrix(NA, nrow=nrow(MQann), ncol=2, dimnames=list(NULL,c("Accession","EntryName")))
  chLe <- MQanLe==1
  if(any(chLe)) MQan3[which(chLe),1] <- unlist(MQan2[which(chLe)])
  chLe <- MQanLe==2
  if(any(chLe)) MQan3[which(chLe),] <- matrix(unlist(MQan2[which(chLe)]), ncol=2, byrow=TRUE)
  chLe <- MQanLe >2
  locAccNo <- function(x) {      # function to select AccessionNumner (eg P02768) and EntryName (eg ALBU_HUMAN) after strsplit() of concatenated annotation
    accIn <- grep("^[[:upper:]]+[[:digit:]]+$|^[[:upper:]]+[[:digit:]]+\\-[[:digit:]]+$", x)
    namId <- grep("[[:upper:]]_[[:upper:]]", x)
    useInd <- c(acc=if(length(accIn) >0) accIn[1] else NA, name=if(length(namId) >0) namId[1] else NA)
    chNA <- is.na(useInd)
    ## fill not-recognized fields with first elements not yet used (setdiff also works if nothing was recognized)
    if(any(chNA)) useInd[which(chNA)] <- setdiff(seq_along(x), wrMisc::naOmit(unique(c(namId,useInd))))[seq_len(sum(chNA))]
    x[useInd] }
  if(any(chLe)) MQan3[which(chLe),] <- t(sapply(MQan2[which(chLe)], locAccNo ))
  chSemc <- grep(";", MQan3[,2])                                  # look for semicolon separator  (eg "CATA_HUMAN_UPS;conta")
  if(length(chSemc) >0) MQan3[chSemc,2] <- sub(";[[:print:]]+","",MQan3[chSemc,2])       # remove all after semicolon (eg "CATA_HUMAN_UPS;conta")

  ## contaminants (fuse from contaminant-column (see 'contamCol') and those found via specPref[1])
  contam <- rep(FALSE,nrow(MQann))
  chCoCol <- contamCol[contamCol %in% colnames(MQann)]
  if(length(chCoCol) >0) { chCo <- trimws(MQann[,chCoCol[1]]) %in% "+"       # line-wise test : which proteins are marked by '+'
    if(any(chCo)) contam[which(chCo)] <- TRUE }
  if(length(specMQ0$conta) >0) contam[specMQ0$conta] <- TRUE     ## from 'specPref' search

  ## extract/add GN
  MQan3 <- cbind(MQan3,GN=NA)
  GNLi <- grep("\\ GN=[[:upper:]]{2,}", MQann[,fastaCol])
  if(length(GNLi) >0) { zz <- sub("[[:print:]]+\\ GN=", "",MQann[GNLi,fastaCol])   # remove surplus to left
    MQan3[GNLi,"GN"] <- sub("\\ +$","",sub("\\ [[:print:]]+","",zz)) }                    # remove surplus to right (and right trailing space)
  
  ## finalize annotation 
  MQann <- cbind(MQan3, Species=NA, Contam=contam, MQann)
  ## extract species according to custom search parameters 'specPref'
  .annSpecies <- function(spe=c("_HUMAN","Homo sapiens"), anno=MQann, exCoNa=extrColNames) {
    ## search species tag spe[1] in anno[,exCoNa[2]], write regular name spe[2] to column 'Species' of matching lines, return matrix anno
    ch1 <- grep(spe[1],anno[,exCoNa[2]])
    if(length(ch1) >0) anno[ch1,"Species"] <- spe[2]  #"Homo sapiens"
    anno }
  commonSpec <- cbind(c("_HUMAN","_MOUSE","_RAT","_PIG","_YEAST"),c("Homo sapiens","Mus musculus","Rattus norvegicus","Sus scrofa","Saccharomyces cerevisiae"))
  for(i in seq_len(nrow(commonSpec))) MQann <- .annSpecies(commonSpec[i,],MQann,exCoNa=extrColNames)
  
  ## now complete (overwrite) by info extracted from fasta : ' OS='
  chSpe <- grep(" OS=[[:upper:]][[:lower:]]+", MQann[,fastaCol])    
  if(length(chSpe) >0) { OS <- sub(" \\(strain [[:print:]]+| [[:upper:]]{2}=[[:print:]]+", "", sub("[[:print:]]+OS=","",MQann[chSpe,fastaCol]))
    ch1 <- grep("^[[:upper:]][[:lower:]]+\\ [[:lower:]]+\\ [[:punct:]]",OS)
    if(length(ch1) >0) OS[ch1] <- sub("\\ [[:punct:]][[:print:]]*$","",OS[ch1]) 
    MQann[chSpe,"Species"] <- OS }
  ## remve text like 'XX=xxx' from [,"Species"] to remove strain name (most likely duplicating information)
  remStrainNo <- TRUE
  if(remStrainNo) MQann[chSpe,"Species"] <- sub(" [[:upper:]]{2}=[[:print:]]+", "", MQann[chSpe,"Species"])
  ## remove remaining tailing semicolon to comma
  ch1 <- grep(";$|,$",MQann[,"Species"])
  if(length(ch1) >0) MQann[ch1,"Species"] <- sub(";+$|,*$","",MQann[ch1,"Species"])  
  ## check for truncated species names (ie names found) inside other ones
  OSna <- unique(MQann[chSpe,"Species"])
  ch1 <- lapply(OSna, grep, OSna, fixed=TRUE)                     # fixed=TRUE : species names are plain text, not regular expressions
  chTr <- vapply(ch1, length, integer(1)) >1
  if(any(chTr)) { if(!silent) message(fxNa,"Found ",sum(chTr)," species name(s) appearing inside other ones, assume as truncated (eg  ",OSna[which(chTr)[1]],")") 
    for(i in which(chTr)) {
      fullNa <- OSna[setdiff(ch1[[i]], i)]                        # longer name(s) containing the truncated one
      MQann[which(MQann[,"Species"] %in% OSna[i]),"Species"] <- fullNa[1] } }     # replace in all lines concerned
  ## check for composite Accession names, keep only 1st part
  ch1 <- grep(",|;|_|\\(|\\|", MQann[,1])    # note: need to not exclude/mark '-'
  if(length(ch1) >0) {
    if(!silent) message(fxNa,"Found ",length(ch1)," composite accession-numbers, truncating (eg ",MQann[ch1[1],1],")")
    ch2 <- grep("^CON_",MQann[ch1,1])        # if composite Acc number starts with CON_, remove this part and rather keep other entry
    if(length(ch2) >0) {
      MQann[ch1[ch2],1] <- sub("^CON_[[:punct:]]*","",MQann[ch1[ch2],1]) }
    ch2 <- grep("[[:alnum:]]+;{0,1}CON__",MQann[ch1,1])              # if composite Acc number is followed by ;CON__ (eg "P02768;CON__P02768-1" or ""P02769CON__A1B2"), keep 1st part
    if(length(ch2) >0) MQann[ch1[ch2],1] <- sub(";{0,1}CON__[[:print:]]*","",MQann[ch1[ch2],1]) 
    ch1 <- grep(",|;|_|\\(|\\|", MQann[,1])
    if(length(ch1) >0) MQann[ch1,1] <- sub(paste(paste0(c(",",";","_","\\(","\\|"),"[[:print:]]*"), collapse="|"), "", MQann[ch1,1]) } 

  ## look for unique col from $annot to use as rownames (inspect up to the first 7 columns : how many elements per column duplicated)
  chAn <- vapply(seq_len(min(ncol(MQann),7)), function(j) sum(duplicated(MQann[,j]), na.rm=TRUE), numeric(1))
  if(!silent) message(fxNa,"Use column '",colnames(MQann)[which.min(chAn)],"' (has fewest, ie ",chAn[which.min(chAn)]," duplicated entries) as rownames") 
  rowNa <- if(any(chAn==0)) MQann[,which(chAn==0)[1]] else wrMisc::correctToUnique(MQann[,which.min(chAn)], callFrom=fxNa)  
  rownames(abund) <- rownames(MQann) <- rowNa
  if(length(counts) >0) rownames(counts) <- rowNa                 # 'counts' may be NULL (no count-columns found)
  
  ## check for reference for normalization
  refLiIni <- refLi
  if(is.character(refLi) && length(refLi)==1) { refLi <- which(MQann[,"SpecType"]==refLi)
    if(length(refLi) <1) message(fxNa," could not find any protein matching argument 'refLi', ignoring ...") else {
      if(!silent) message(fxNa," normalize using subset of ",length(refLi))}}           # may be "mainSpe"
  if(length(refLi) <1) refLi <- NULL
  ## take log2 & normalize
  abundN <- wrMisc::normalizeThis(log2(abund), method=normalizeMeth, refLines=refLi, callFrom=fxNa)       #

  ## plot distribution of intensities
  custLay <- NULL
  if(length(plotGraph) >0) {if(is.numeric(plotGraph)) {custLay <- plotGraph; plotGraph <- TRUE
    } else  {plotGraph <- isTRUE(as.logical(plotGraph[1]))}
  } else plotGraph <- FALSE                                   # NULL or empty : no plot
  if(plotGraph){
    ## save graphical parameters and make sure they get restored, even if plotting fails
    opar <- graphics::par(no.readonly=TRUE)      
    on.exit(graphics::par(opar), add=TRUE)
    if(length(custLay) >0) graphics::layout(custLay) else graphics::layout(1:2)
    graphics::par(mar=c(3, 3, 3, 1))                           # mar: bot,le,top,ri
    chGr <- try(find.package("wrGraph"),silent=TRUE)
    chSm <- try(find.package("sm"),silent=TRUE)
    misPa <- c("try-error" %in% class(chGr),"try-error" %in% class(chSm))
    if(is.null(tit)) tit <- "MaxQuant Quantification "    
    titSu <- if(length(refLi) >0) paste0(c(" by ",if(length(refLiIni) >1) c(length(refLi)," selected lines") else c("'",refLiIni,"'")),collapse="")  else NULL
    if(any(misPa)) { 
      if(!silent) message(fxNa," missing package ",wrMisc::pasteC(c("wrGraph","sm")[which(misPa)],quoteC="'")," for drawing vioplots")
      ## wrGraph not available : simple boxplot  
      graphics::boxplot(log2(abund), main=paste(tit," (initial)"), las=1, outline=FALSE)
      graphics::abline(h=round(stats::median(log2(abund),na.rm=TRUE)) +(-1:1), lty=2, col=grDevices::grey(0.6)) 
      ## now normalized
      graphics::boxplot(abundN,main=paste(tit," (",normalizeMeth,"-normalized",titSu,")"), las=1, outline=FALSE)
      graphics::abline(h=round(stats::median(abundN,na.rm=TRUE)) +(-1:1), lty=2, col=grDevices::grey(0.6))
    } else {                                                  # wrGraph and sm are available
      wrGraph::vioplotW(log2(abund), tit=paste(tit," (initial)"), wex=wex) 
      graphics::abline(h=round(stats::median(log2(abund),na.rm=TRUE)) +(-1:1), lty=2, col=grDevices::grey(0.6)) 
      ## now normalized
      wrGraph::vioplotW((abundN), tit=paste(tit," , ",normalizeMeth,"-normalized",titSu), wex=wex)
      graphics::abline(h=round(stats::median(abundN,na.rm=TRUE)) +(-1:1), lty=2, col=grDevices::grey(0.6))    
    }
  }

  ## meta-data
  notes <- c( qmethod="MaxQuant", normalizeMeth=normalizeMeth, call=match.call(), created=as.character(Sys.time()), 
    wrProteo.version=utils::packageVersion("wrProteo"), machine=Sys.info()["nodename"])
  ## prepare for final output
  if(separateAnnot) list(raw=abund, quant=abundN, annot=MQann, counts=counts, quantNotes=NULL, notes=notes) else data.frame(abund,MQann)
}  
    
