Other than scatter, are there alternate methods to iterate over an array in FireCloud?
I am creating several subsets of data (based on an input dataset and a class vector). For each data subset, I want to run a copy number correlation analysis, which requires a scatter to parallelize the calculation of millions of correlation values. I implemented this as a 2-level scatter with a sub-workflow. I tested it locally and it works fine in Cromwell v29, but when I try to upload the workflow to FireCloud I get a "WDL imports not yet supported" error -- it looks like I can't import the sub-workflow.
Any ideas on implementing this workflow are greatly appreciated. I've attached the workflow (cna_analysis_bysubgroup.wdl) and sub-workflow (cna_analysis.wdl).
cna_analysis_bysubgroup.wdl
import "cna_analysis.wdl" as cna_analysis
# Runs the "harmonize" step of run-pipeline.sh on the rna / cna / pome inputs.
#
# Inputs:
#   rna, cna, pome - input files, passed to the script as -rna, -cna and -f
#   analysisDir    - passed to the script as -r
#   codeDir        - directory containing run-pipeline.sh; also passed as -c
#   dataDir        - passed to the script as -d
# Output:
#   outputs - harmonize-output.tar, picked up from the task's working directory
task harmonize_data {
  File rna
  File cna
  File pome
  String analysisDir
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"

  command {
    set -euo pipefail
    # create matrix files from input gct -- harmonize both rows and columns
    # (the script is located through codeDir rather than a duplicated literal path,
    #  so overriding codeDir is honored; the default resolves to the same path as before)
    ${codeDir}/run-pipeline.sh harmonize -c ${codeDir} -d ${dataDir} -r ${analysisDir} -rna ${rna} -cna ${cna} -f ${pome}
  }

  output {
    File outputs = "harmonize-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}
# Runs the "CNAsetup" step of run-pipeline.sh on the harmonization tarball.
#
# Inputs:
#   tarball - passed to the script as -i
#   jidMax  - optional; when supplied it is passed as -pe
#   groups  - optional expt-design-like file used for subgroups; when supplied
#             it is passed as -g
#   codeDir - directory containing run-pipeline.sh; also passed as -c
# Outputs:
#   matrixFiles - rows of file_table.tsv (one line per group, in order: rna, cna, pome)
#   subgroups   - lines of subgroups.txt
#   jidsFile    - jids.txt
#   outputs     - CNAsetup-output.tar
task cna_analysis_setup {
  File tarball
  Int? jidMax
  File? groups   # expt-design-like file to use for subgroups
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"

  command {
    set -euo pipefail
    # setup directories and code (not needed to run, but for final tar file output);
    # use matrix files from harmonization and create any subsets
    # create table of matrix files (tsv, one line per group, in order: rna, cna, pome)
    # determine subgroup list (separately, since this is a string and cannot be part of matrix files)
    # determine actual jidMax and create file with list of jids
    # (the script is located through codeDir rather than a duplicated literal path,
    #  so overriding codeDir is honored; the default resolves to the same path as before)
    ${codeDir}/run-pipeline.sh CNAsetup -i ${tarball} -c ${codeDir} ${"-g " + groups} ${"-pe " + jidMax}
  }

  output {
    Array[Array[File]] matrixFiles = read_tsv ("file_table.tsv")
    Array[String] subgroups = read_lines ("subgroups.txt")
    File jidsFile = "jids.txt"
    File outputs = "CNAsetup-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}
# Unpacks the setup tarball, copies the result tables and plots into its "cna"
# directory, and re-packs everything as CNA-output.tar.
#
# Inputs:
#   table_files - 2D array of result tables (flattened in the command below)
#   plot_files  - plot files
#   tarball     - tarball to extract and extend
# Output:
#   outputs - CNA-output.tar
task assemble_results {
  Array[Array[File]] table_files
  Array[File] plot_files
  File tarball

  command {
    # -o pipefail is deliberately omitted: it results in error 141 when
    # using ... | head -1 | ...
    set -eu

    # extract the tarball in the current directory; the first path component
    # of its first entry is taken as the analysis directory
    tar -x -f ${tarball}
    top_dir=$(tar -t -f ${tarball} | head -1 | sed -e 's/\/.*//')
    cd $top_dir

    # copy result tables/plots to the appropriate location.
    # First flatten the table_files 2D array:
    #   using sep=" " creates ["item1.1", ... "item1.n"] ["item2.1", ... "item2.n"] ...
    #   which is flattened by removing [ ] , "
    flat_tables=$(echo '${sep=" " table_files}' | tr -d '][,"')
    cp $flat_tables cna
    cp ${sep=" " plot_files} cna

    # step back out and create the new output tarball
    cd ..
    tar -c -f CNA-output.tar $top_dir
  }

  output {
    File outputs = "CNA-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}
# Top-level workflow: harmonize the inputs, set up the subgroups, run the
# imported cna_analysis.run_cna_analysis sub-workflow once per subgroup, and
# assemble all tables and plots into one tarball.
workflow run_cna_analysis_on_subgroups {
  File rna
  File cna
  File pome
  String analysisDir

  call harmonize_data {
    input:
      rna = rna,
      cna = cna,
      pome = pome,
      analysisDir = analysisDir
  }

  call cna_analysis_setup {
    input:
      tarball = harmonize_data.outputs
  }

  # one sub-workflow invocation per subgroup; each matrixFiles row supplies
  # that subgroup's files in the order rna, cna, pome
  scatter (sg in range (length (cna_analysis_setup.subgroups))) {
    call cna_analysis.run_cna_analysis as cna_s {
      input:
        prefix = cna_analysis_setup.subgroups[sg],
        rna = cna_analysis_setup.matrixFiles[sg][0],
        cna = cna_analysis_setup.matrixFiles[sg][1],
        pome = cna_analysis_setup.matrixFiles[sg][2],
        jidsFile = cna_analysis_setup.jidsFile
    }
  }

  # gather the per-subgroup tables and plots into the setup tarball
  call assemble_results {
    input:
      table_files = cna_s.tables,
      plot_files = cna_s.plot,
      tarball = cna_analysis_setup.outputs
  }

  output {
    File final_output = assemble_results.outputs
  }
}
cna_analysis.wdl
# Runs cna-analysis.r for a single shard (jid) of the analysis.
#
# Inputs:
#   rna, cna, pome - input files passed to the R script
#   prefix         - names the "<prefix>-output" directory and is passed to the R script
#   jidMax, jid    - passed to the R script; jid also appears in the output file names
#   codeDir        - directory the R scripts are copied from
# Outputs:
#   the four "<prefix>-output/*-vs-cna-{corr,pval}<jid>.csv" files
task cna_analysis {
  File rna
  File cna
  File pome
  String prefix
  Int jidMax
  Int jid
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"

  command {
    set -euo pipefail

    # bring the R scripts into the working directory
    cp ${codeDir}/cna-analysis.r ${codeDir}/generate-cna-plots.r .

    # create the output directory unless it is already there
    [ -d ${prefix}-output ] || mkdir ${prefix}-output

    # run cna analysis for corresponding shard / gather
    Rscript cna-analysis.r ${jid} ${jidMax} ${prefix} ${rna} ${cna} ${pome}
  }

  output {
    File rna_cna_corr = "${prefix}-output/mrna-vs-cna-corr${jid}.csv"
    File rna_cna_pval = "${prefix}-output/mrna-vs-cna-pval${jid}.csv"
    File pome_cna_corr = "${prefix}-output/pome-vs-cna-corr${jid}.csv"
    File pome_cna_pval = "${prefix}-output/pome-vs-cna-pval${jid}.csv"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}
# Collects the per-shard result files into "<prefix>-output" and runs
# cna-analysis.r with jid 0 and NULL in place of the three data files.
#
# Inputs:
#   prefix  - names the output directory, the table glob and the plot file
#   jidMax  - passed to the R script
#   rna_vs_cna_corr, rna_vs_cna_pval,
#   pome_vs_cna_corr, pome_vs_cna_pval - per-shard files moved into "<prefix>-output"
#   codeDir - directory the R scripts are copied from
#   dataDir - directory chr-length.csv and gene-location.csv are copied from
# Outputs:
#   tables - files matching "<prefix>-*-vs-*.csv"
#   plot   - "<prefix>-cna-plot.png"
task gather_results_and_plot {
  String prefix
  Int jidMax
  Array[File] rna_vs_cna_corr
  Array[File] rna_vs_cna_pval
  Array[File] pome_vs_cna_corr
  Array[File] pome_vs_cna_pval
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"

  command {
    set -euo pipefail

    # bring the R scripts and reference tables into the working directory
    cp ${codeDir}/cna-analysis.r ${codeDir}/generate-cna-plots.r .
    cp ${dataDir}/chr-length.csv ${dataDir}/gene-location.csv .

    # create the output directory unless it is already there
    [ -d ${prefix}-output ] || mkdir ${prefix}-output

    # move the results of the scatter operation into the output directory
    mv ${sep=" " rna_vs_cna_corr} ${prefix}-output
    mv ${sep=" " rna_vs_cna_pval} ${prefix}-output
    mv ${sep=" " pome_vs_cna_corr} ${prefix}-output
    mv ${sep=" " pome_vs_cna_pval} ${prefix}-output

    # run cna analysis for corresponding shard / gather
    Rscript cna-analysis.r 0 ${jidMax} ${prefix} NULL NULL NULL
  }

  output {
    Array[File] tables = glob ("${prefix}-*-vs-*.csv")
    File plot = "${prefix}-cna-plot.png"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}
# Sub-workflow: scatters cna_analysis over the job ids listed in jidsFile, then
# gathers the per-shard results and produces the tables and plot.
workflow run_cna_analysis {
  File rna
  File cna
  File pome
  String prefix
  File jidsFile

  # one job id per line of jidsFile; jidMax is the number of ids
  Array[Int] jids = read_lines (jidsFile)
  Int jidMax = length (jids)

  scatter (shard_id in jids) {
    call cna_analysis {
      input:
        rna = rna,
        cna = cna,
        pome = pome,
        prefix = prefix,
        jidMax = jidMax,
        jid = shard_id
    }
  }

  # the scattered call's outputs are gathered into arrays, one element per shard
  call gather_results_and_plot {
    input:
      prefix = prefix,
      jidMax = jidMax,
      rna_vs_cna_corr = cna_analysis.rna_cna_corr,
      rna_vs_cna_pval = cna_analysis.rna_cna_pval,
      pome_vs_cna_corr = cna_analysis.pome_cna_corr,
      pome_vs_cna_pval = cna_analysis.pome_cna_pval
  }

  output {
    Array[File] tables = gather_results_and_plot.tables
    File plot = gather_results_and_plot.plot
  }
}