#!/bin/env Rscript ## ## bigFile2trackHubViews.R ## ## R script to take bigWig/bigBed files as input and create the ## config code necessary for UCSC genome browser tracks grouped ## together in Views. Coverage tracks are in a coverage view, ## and peaks are in a peak view. Group annotations are added ## for filtering on the configuration page in the UCSC browser. ## environmental variables are used to send parameters: # basePriority # trackSetName # trackSet # windowingFunction # ## Example of running this script with some parameters being set: # basePriority=200 trackSetName=exampleTrackSet windowingFunction="mean+whiskers" ./bigFile2trackHubViews.R ## Initial R setup, load CBioRUtils.R from within the Bpipe path BpipeBase <- Sys.getenv("BPIPEBASE"); if (BpipeBase == "") { BpipeBase <- "/ddn/gs1/home/wardjm/Builds/NIEHS_IntegrativeBIx-svn/Bpipe"; #stop("Error: The environmental variable BPIPEBASE was not set."); } cbioPath <- file.path(BpipeBase, "bpipeScripts", "CBioRUtils.R"); if (file.exists(cbioPath)) { CBioRUtils <- new.env(parent=.GlobalEnv); sys.source(cbioPath, envir=CBioRUtils, keep.source=TRUE); assign("CBioRUtils", CBioRUtils, envir=.GlobalEnv); attach(CBioRUtils, warn=FALSE, pos=1); } ## Read arguments from environment variables ## e.g. run the command like this: ## basePriority=1100 ./bigFile2trackHubViews.R cmdArgs <- commandArgs(trailingOnly=TRUE); argsNamed <- getArgs(); printDebug("cmdArgs:", cmdArgs, c("orange", "lightblue")); VERBOSE <- getEnvParam("VERBOSE", "boolean", FALSE); USETRACKNAME <- getEnvParam("USETRACKNAME", "boolean", FALSE); ## Determine infiles ## junction files are bigBed files with "junc" somewhere in the filename ## peak files are bigBed files with no "junc" in the filename argFiles <- cmdArgs[file.exists(cmdArgs)]; if (VERBOSE) { printDebug("argFiles:", argFiles, c("orange", "lightblue")); } if (length(argFiles) == 0) { ## Fallback approach is to take all files in this directory argFiles <- list.files(pattern="[.](bw|bigWig|bb|bigBed)$"); } bwFiles <- vigrep(".bw$|.bigWig$", argFiles); peakFiles <- unvigrep("junc", vigrep(".bb$|.bigBed$", argFiles)); juncFiles <- vigrep("junc", vigrep(".bb$|.bigBed$", argFiles)); ## Re-classify files as junction files if we also have combinedJunctions files if (igrepHas("combinedJunctions", juncFiles) && length(peakFiles) > 1) { juncFiles <- c(juncFiles, peakFiles); peakFiles <- character(0); } if (length(bwFiles) == 0 && length(peakFiles) == 0 && length(juncFiles) == 0) { printDebug("Error: No input files supplied nor found in the current working directory: ", getwd(), c("orange", "lightblue")); stop("Input files not found."); } allFilesDF <- data.frame(check.names=FALSE, stringsAsFactors=FALSE, sampleID=c(bwFiles, peakFiles, juncFiles), file=c(bwFiles, peakFiles, juncFiles), type=rep(c("coverage", "peaks", "junctions"), slength(list(bwFiles, peakFiles, juncFiles)))); if (VERBOSE) { ch(allFilesDF, maxRows=40); } ## 04jun2015: Specific sampleID editing to handle the output file format for homer ChIPseq peak calls ## allFilesDF[,"sampleID"] <- gsub("^(homer|.+)Comparison_(.+)-Vs-.*$", "\\2", allFilesDF[,"sampleID"]); ## 04jun2015: Moved the logic of splitting sampleID_restofffile.ext to this step instead of the data.frame() above, ## for better granularity of processing. ## Also changed from splitting at the first underscore _ to splitting at only the delimiter used for ## read number or single-end read, for example these 4 patterns: _1. _2. _S. and _s. ## This change should allow sampleIDs which have their own underscore _ character already if necessary. allFilesDF[,"sampleID"] <- gsub("(_[12sS][.]|.(STAR|TopHat2)_|[_.]union).+", "", allFilesDF[,"sampleID"]); ## Pre-set some parameters, in future may become user-defined # basePriority <- getEnvParam("basePriority", "integer", 1100); #forceSave <- getEnvParam("forceSave", "boolean", FALSE); #doTrackSetParent <- getEnvParam("doTrackSetParent", "boolean", TRUE); trackSetName <- getEnvParam("trackSetName", "character", "Working Tracks"); trackSet <- getEnvParam("trackSet", "character", gsub("[ ]+", "", trackSetName)); if (igrepHas(" ", trackSet)) { trackSet <- gsub("[ ]+", "", trackSet); printDebug("Removed spaces from trackSet: ", trackSet, c("orange", "lightgreen")); } #appendTrackSet <- getEnvParam("appendTrackSet", "boolean", FALSE); # windowingFunction - defines the UCSC track parameter for the type of histogram # e.g. mean, maximum, mean+whiskers windowingFunction <- getEnvParam("windowingFunction", "character", "mean+whiskers"); ## Read sampleGroups.txt file sampleGroupsFile <- getEnvParam("sampleGroupsFile", "character", "sampleGroups.txt"); if (!file.exists(sampleGroupsFile)) { stop(paste("The sampleGroups file does not exist:", sampleGroupsFile)); } sampleGroups <- readTable(sampleGroupsFile); if (!igrepHas("sample", colnames(sampleGroups)[1])) { sampleGroups <- rbind(colnames(sampleGroups), sampleGroups); colnames(sampleGroups) <- c("sampleID", paste("groupFactor", 1:(ncol(sampleGroups)-1), sep="_")); } ## Determine group factors and use them to generate labels and potentially colors groupCols <- vigrep("^group", colnames(sampleGroups)); sampleCol <- head(colnames(sampleGroups), 1); ## when colors are created, groupCols1 defines the primary color, groupCols2 defines the shading of that color if (length(groupCols) == 1) { ## One group column, so just color each group, but make each replicate a little different groupCols1 <- groupCols; groupCols2 <- sampleCol; } else { ## Multiple group columns, use the last one groupCols1 <- head(groupCols, -1); groupCols2 <- tail(groupCols, 1); } sampleNames <- nameVector(pasteByRow(sampleGroups[,groupCols,drop=FALSE], sep="_"), sampleGroups[,sampleCol]); sampleGroupNames <- unique(sampleNames); sampleGroups[,"sampleGroupNames"] <- sampleNames; sampleGroups[,"sampleNames"] <- sampleNames; if (any(table(sampleNames) > 1)) { sampleNames <- paste(pasteByRow(sampleGroups[,groupCols], sep="_"), sampleGroups[,sampleCol]); sampleGroups[,"sampleNames"] <- sampleNames; sampleGroups1 <- sampleGroups[match(sampleGroupNames, sampleGroups[,"sampleGroupNames"]),]; sampleGroups1[,sampleCol] <- sampleGroupNames; sampleGroups1[,"sampleNames"] <- sampleGroupNames; sampleGroups <- rbind(sampleGroups, sampleGroups1); sampleNames <- nameVector(sampleGroups[,c("sampleNames", sampleCol)]); } #groupNames <- nameVector(paste(sampleGroups[,sampleCol], # pasteByRow(sampleGroups[,groupCols], sep="_")), # sampleGroups[,sampleCol]); allFilesDF[,"sampleName"] <- sampleNames[allFilesDF[,"sampleID"]]; ## Check if there are colors defined in sampleGroups groupColors <- NULL; if (igrepHas("color", colnames(sampleGroups))) { colorCol <- head(vigrep("color", colnames(sampleGroups)), 1); groupColors <- tryCatch({ if (igrepHas(",", sampleGroups[1,colorCol])) { groupColorsRgb <- sampleGroups[,colorCol]; groupColors <- rgb2col(do.call(rbind, lapply(strsplit(groupColorsRgb, ","), as.numeric))); } else { groupColors <- rgb2col(col2rgb(sampleGroups[,colorCol])); groupColorsRgb <- pasteByRow(t(col2rgb(groupColors)), sep=","); } names(groupColors) <- sampleGroups[,sampleCol]; #names(groupColorsRgb) <- sampleNames; groupColors; }, error=function(e){ printDebug("Error: Colors in the sampleGroups file were not properly converted for use in R."); printDebug("sampleGroups[,colorCol]: ", head(sampleGroups[,colorCol], 10), c("orange", "lightblue")); printError(callName="rgb2col(col2rgb(sampleGroups[,colorCol]))", e=e); NULL; }); groupColorsRgb <- pasteByRow(t(col2rgb(groupColors)), sep=","); names(groupColorsRgb) <- sampleGroups[,sampleCol]; } ## If for some reason the steps above failed, or colors were not supplied, use our own method if (is.null(groupColors)) { groupColors <- subgroup2colors(reverseGradient=TRUE, gradientWtFactor=1/3, groupLabels=pasteByRow(sampleGroups[,groupCols1,drop=FALSE]), subgroupLabels=sampleGroups[,groupCols2]); names(groupColors) <- sampleGroups[,sampleCol]; groupColorsRgb <- pasteByRow(t(col2rgb(groupColors)), sep=","); names(groupColorsRgb) <- sampleGroups[,sampleCol]; } #showColramp(groupColors); ## Update the file-based table with sample annotations, colors, sort, priority, etc. allFilesDF[,"color"] <- groupColors[allFilesDF[,"sampleID"]]; allFilesDF[,"rgb"] <- groupColorsRgb[allFilesDF[,"sampleID"]]; allFilesDF[,"sampleGroupsOrder"] <- match(allFilesDF[,"sampleID"], sampleGroups[,sampleCol]); allFilesDF <- allFilesDF[order(allFilesDF[,"type"], allFilesDF[,"sampleGroupsOrder"]),]; allFilesDF[,"priorityNum"] <- seq_len(nrow(allFilesDF))*2 + basePriority; allFilesDF[,"trackName"] <- makeNames(pasteByRow(allFilesDF[,c("sampleID", "type")], sep="_")); ## Special track name for ChIPseq peaks which contain format "group1-Vs-group2" whichPeakRename <- igrep(".+-Vs-.+", allFilesDF[,"file"]); if (length(whichPeakRename) > 0) { if (VERBOSE) { printDebug("Applying track renaming to ChIPseq peaks: ", allFilesDF[whichPeakRename,"trackName"], c("orange", "lightblue")); } #allFilesDF[whichPeakRename,"trackName"] <- camelCase(gsub("^(.+)-Vs-(.+)[.](findPeaks|sorted|bb).*$", "\\1 vs \\2", # gsub("[.]sorted[.]bb$", ".bb", gsub("^(homer|.+)Comparison_([^.]+).", "\\2_\\1.", allFilesDF[whichPeakRename,"file"]))), sepMultiple=NULL); allFilesDF[whichPeakRename,"trackName"] <- camelCase( gsub("findPeaks(|_none)[-_]*", "", gsub("^(homer|.+)Comparison_([^-.]+)($|[-.])", "\\2_\\1\\3", gsub("^(.+)-Vs-(.+)[.](findPeaks|sorted|bb).*$", "\\1 vs \\2", gsub("[.]sorted[.]bb$", ".bb", allFilesDF[whichPeakRename,"file"])))), sepMultiple=NULL); #allFilesDF[whichPeakRename,"trackName"] <- camelCase(gsub("^(.+)-Vs-([^.]+)[.].+$", "\\1 vs \\2", # gsub("^(homer|.+)Comparison_([^.]+).", "\\2_\\1.", allFilesDF[whichPeakRename,"file"])), sepMultiple=NULL); } ch(allFilesDF, maxRows=40, bgCol="blue4", rows=2); ############################### ## For now we will ignore positive/negative strand tracks being put into multiWig tracks ############################### ## Define some template config blocks to be used in the trackDb file bwParentTemplate <- " #################################### ## track set ## ${trackSetName} track ${trackSet} ${compositeSuperTrack} on shortLabel ${trackSetName} longLabel ${trackSetName} type bigWig allButtonPair on centerLabelsDense on dragAndDrop on priority ${basePriority} alwaysZero on graphTypeDefault bar maxHeightPixels 100:60:8 smoothingWindow 2 transformFunc NONE windowingFunction ${windowingFunction} gridDefault on autoScale on viewLimits 0:50 visibility full "; bwSuperTrackTemplate <- " #################################### ## track set ## ${trackSetName} track ${trackSet} ${compositeSuperTrack} on show shortLabel ${trackSetName} longLabel ${trackSetName} priority ${basePriority} "; bwTemplate <- " # ################################## # bigWig coverage track # ${bwTrack} track ${bwTrack}${bwTrackParent} bigDataUrl ${bwFile} shortLabel ${bwFileName} coverage longLabel ${bwFileName} coverage type bigWig color ${bwColor} priority ${bwPriority} alwaysZero on graphTypeDefault bar maxHeightPixels 100:60:8 smoothingWindow 2 transformFunc NONE windowingFunction ${windowingFunction} gridDefault on autoScale on viewLimits 0:50 visibility full "; bwStrandedParentTemplate <- " # ################################## # # Overlay multiple stranded tracks track ${bwStrandedTrack}${bwTrackParent} container multiWig aggregate transparentOverlay shortLabel ${bwFileName} stranded coverage longLabel ${bwFileName} stranded coverage showSubtrackColorOnUi on type bigWig priority ${bwPriority} alwaysZero off graphTypeDefault bar maxHeightPixels 100:66:8 smoothingWindow 2 transformFunc NONE windowingFunction ${windowingFunction} gridDefault on autoScale on viewLimits -50:50 visibility full "; bwStrandedTemplatePos <- " # ################################## track ${bwTrackPos} parent ${bwStrandedTrack} bigDataUrl ${bwFilePos} type bigWig color ${bwColorPos} priority ${bwPriorityPos} "; bwStrandedTemplateNeg <- " # ################################## track ${bwTrackNeg} parent ${bwStrandedTrack} bigDataUrl ${bwFileNeg} type bigWig color ${bwColorNeg} priority ${bwPriorityNeg} "; ## Templates intended to use Views ## Dr. Kevin Trotter MiSeq ChIPseq data bwViewsParent <- " ## Beginning of track view track ${trackSet} compositeTrack on shortLabel ${trackSetName} longLabel ${trackSetName} priority ${basePriority} visibility full type bed 3 configurable on subGroup1 view Views \\ COV=Coverage \\ PEAK=Peaks \\ JUNC=Junctions ${subGroups}${groupDim}${groupSortText} "; bwViewsCoverage <- " ## Views track coverage data track ${trackSet}View1 parent ${trackSet} shortLabel Coverage view COV visibility full type bigWig allButtonPair on centerLabelsDense on dragAndDrop on alwaysZero on graphTypeDefault bar maxHeightPixels 100:60:8 smoothingWindow 2 transformFunc NONE windowingFunction ${windowingFunction} gridDefault on viewLimits 0:20 viewUi on "; bwViewsCoverageTrack <- " ## Coverage track track ${trackName} parent ${trackSet}View1 on bigDataUrl ${bwFile} shortLabel ${bwFileName} coverage longLabel ${bwFileName} coverage type bigWig color ${bwColor} priority ${bwPriority} alwaysZero on graphTypeDefault bar maxHeightPixels 100:60:8 smoothingWindow 2 transformFunc NONE windowingFunction ${windowingFunction} gridDefault on viewLimits 0:20 visibility hide subGroups ${groupLevel} "; bwViewsPeak <- " ## Views track for ChIPseq peaks track ${trackSet}View3 parent ${trackSet} shortLabel Peaks view PEAK visibility pack type bigBed allButtonPair on centerLabelsDense on dragAndDrop on viewUi on "; bwViewsPeakTrack <- " # Peak track track ${trackName} parent ${trackSet}View3 on bigDataUrl ${bwFile} shortLabel ${bwFileName} peaks longLabel ${bwFileName} peaks type bigBed visibility pack priority ${bwPriority} color ${bwColor} subGroups ${groupLevel} "; bwViewsJunction <- " ## Views track for RNAseq, junctions data track ${trackSet}View2 parent ${trackSet} shortLabel RNAseq_junctions view JUNC visibility pack type bigBed 12 allButtonPair on centerLabelsDense on dragAndDrop on thickDrawItem on scoreFilter 20 scoreFilterLimits 1:1000 viewUi on "; bwViewsJunctionTrack <- " # combined junctions track track ${trackName} parent ${trackSet}View2 on bigDataUrl ${bwFile} shortLabel ${bwFileName} junctions longLabel ${bwFileName} junctions type bigBed 12 visibility pack priority ${bwPriority} scoreMin 10 scoreMax 100 color ${bwColor} subGroups ${groupLevel} "; ## bwViewsParent ## bwViewsCoverage ## bwViewsCoverageTrack ## bwViewsCoverageTrack ## bwViewsPeak ## bwViewsPeakTrack ## bwViewsPeakTrack ## bwViewsJunction ## bwViewsJunctionTrack ## bwViewsJunctionTrack ## Define group1levels and group2levels subGroups <- ""; sortText <- ""; groupNum <- 1; for (groupCol in groupCols) { groupLevels1 <- unique(sampleGroups[,groupCol]); groupLevels <- paste(" ", paste(groupLevels1, groupLevels1, sep="="), collapse=" \\\\\n"); subGroupText <- paste0("subGroup", groupNum+1, " ", "Group", groupNum, " Group", groupNum, " \\\\ ", groupLevels, " "); subGroups <- c(subGroups, subGroupText); sortText <- paste0(sortText, paste0("Group", groupNum, "=+ ")) groupNum <- groupNum + 1; } if (length(groupCols) > 1) { groupDim <- "dimensions dimX=Group1 dimY=Group2 "; } else { groupDim <- ""; } subGroups <- paste0(subGroups, collapse=""); groupSortText <- paste0("sortOrder ", sortText, " view=+"); ## Iterate through allFilesDF allConfigLines <- c(""); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsParent)); #cat(allConfigLines); ## Coverage tracks if (igrepHas("coverage", allFilesDF[,"type"])) { printDebug("Adding Coverage track view section."); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsCoverage)); #cat(allConfigLines); bwRows <- igrep("coverage", allFilesDF[,"type"]); for (i in bwRows) { printDebug("Adding Coverage track:", allFilesDF[i,], c("orange", "lightblue")); bwFile <- allFilesDF[i,"file"]; bwFileName <- allFilesDF[i,"sampleName"]; bwColor <- allFilesDF[i,"rgb"]; bwPriority <- allFilesDF[i,"priorityNum"]; trackName <- allFilesDF[i,"trackName"]; bwSample <- allFilesDF[i,"sampleID"]; ## Figure out group annotation for this track sampleGroupsRow <- match(bwSample, sampleGroups[,sampleCol]); groupNum <- 1; groupLevel <- ""; for (groupCol in groupCols) { groupLevel <- paste0(groupLevel, paste0("Group", groupNum, "=", sampleGroups[sampleGroupsRow,groupCol]), " "); groupNum <- groupNum + 1; } groupLevel <- paste0(groupLevel, "view=COV"); #printDebug("doing doTemplateSubstitutions(bwViewsCoverageTrack)"); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsCoverageTrack)); #cat(allConfigLines1); } } ## Junction tracks if (igrepHas("junction", allFilesDF[,"type"])) { printDebug("Adding Junction track view section."); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsJunction)); bwRows <- igrep("junction", allFilesDF[,"type"]); for (i in bwRows) { bwFile <- allFilesDF[i,"file"]; bwFileName <- allFilesDF[i,"sampleName"]; bwColor <- allFilesDF[i,"rgb"]; bwPriority <- allFilesDF[i,"priorityNum"]; trackName <- allFilesDF[i,"trackName"]; bwSample <- allFilesDF[i,"sampleID"]; ## Figure out group annotation for this track sampleGroupsRow <- match(bwSample, sampleGroups[,sampleCol]); groupNum <- 1; groupLevel <- ""; for (groupCol in groupCols) { groupLevel <- paste0(groupLevel, paste0("Group", groupNum, "=", sampleGroups[sampleGroupsRow,groupCol]), " "); groupNum <- groupNum + 1; } groupLevel <- paste0(groupLevel, "view=JUNC"); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsJunctionTrack)); #cat(allConfigLines1); } } ## Peak tracks if (igrepHas("peak", allFilesDF[,"type"])) { printDebug("Adding Peak track view section."); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsPeak)); bwRows <- igrep("peak", allFilesDF[,"type"]); for (i in bwRows) { bwFile <- allFilesDF[i,"file"]; if (USETRACKNAME) { bwFileName <- allFilesDF[i,"trackName"]; } else { bwFileName <- allFilesDF[i,"sampleName"]; } bwColor <- allFilesDF[i,"rgb"]; bwPriority <- allFilesDF[i,"priorityNum"]; trackName <- allFilesDF[i,"trackName"]; bwSample <- allFilesDF[i,"sampleID"]; ## Figure out group annotation for this track sampleGroupsRow <- match(bwSample, sampleGroups[,sampleCol]); groupNum <- 1; groupLevel <- ""; for (groupCol in groupCols) { groupLevel <- paste0(groupLevel, paste0("Group", groupNum, "=", sampleGroups[sampleGroupsRow,groupCol]), " "); groupNum <- groupNum + 1; } groupLevel <- paste0(groupLevel, "view=PEAK"); allConfigLines <- c(allConfigLines, doTemplateSubstitutions(bwViewsPeakTrack)); #cat(allConfigLines1); } } ## Now write out the full configuration file iterNum <- 1; outfileBase <- paste("trackDb_views_", trackSet, "-", nrow(allFilesDF), "files-v", iterNum, "_", getDate(), sep=""); outfileTxt <- paste(outfileBase, ".txt", sep=""); while(file.exists(outfileTxt)) { iterNum <- iterNum + 1; outfileBase <- paste("trackDb_views_", trackSet, "-", nrow(allFilesDF), "files-v", iterNum, "_", getDate(), sep=""); outfileTxt <- paste(outfileBase, ".txt", sep=""); } cat(file=outfileTxt, append=FALSE, c(allConfigLines, "\n"));