Quantcast
Channel: Ask the FireCloud Team — GATK-Forum
Viewing all articles
Browse latest Browse all 1147

Iteration in FireCloud

$
0
0

Other than scatter, are there alternate methods to iterate over an array in FireCloud?

I am creating several subsets of data (based on an input dataset and a class vector). For each data subset, I want to run a copy number correlation analysis, which requires a scatter to parallelize the calculation of millions of correlation values. I implemented this as a two-level scatter with a sub-workflow. I tested it locally and it works fine in Cromwell v29, but when I try to upload the workflow to FireCloud I get a "WDL imports not yet supported" error -- it looks like I can't import the sub-workflow.

Any ideas on implementing this workflow are greatly appreciated. I've attached the workflow (cna_analysis_bysubgroup.wdl) and sub-workflow (cna_analysis.wdl).

cna_analysis_bysubgroup.wdl

import "cna_analysis.wdl" as cna_analysis


task harmonize_data {
  # Input tables (gct, per the command comment below), forwarded to the
  # pipeline script as -rna / -cna / -f respectively.
  File rna
  File cna
  File pome
  # Forwarded to the pipeline script as -r.
  String analysisDir
  # Forwarded to the pipeline script as -c and -d.
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"

  command {
    set -euo pipefail
    # build matrix files from the input gct files, harmonizing rows and columns
    /prot/proteomics/Projects/PGDAC/src/run-pipeline.sh harmonize \
      -c ${codeDir} \
      -d ${dataDir} \
      -r ${analysisDir} \
      -rna ${rna} \
      -cna ${cna} \
      -f ${pome}
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  output {
    # tarball expected in the task working directory after the script runs
    File outputs = "harmonize-output.tar"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}



task cna_analysis_setup {
  # Tarball handed to the pipeline script as -i.
  File tarball
  # Optional; when supplied it is passed as "-pe <jidMax>".
  Int? jidMax
  # Optional expt-design-like file used to define subgroups; passed as "-g <groups>".
  File? groups
  # Forwarded to the pipeline script as -c.
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"

  command {
    set -euo pipefail
    # What the CNAsetup step is expected to do:
    #  - set up directories and code (not needed to run, but kept for the final tar file output)
    #  - reuse the matrix files from harmonization and create any subsets
    #  - write a table of matrix files (tsv, one line per group, in order: rna, cna, pome)
    #  - write the subgroup list separately (it is a string and cannot be part of the matrix files)
    #  - determine the actual jidMax and write a file listing the jid values
    /prot/proteomics/Projects/PGDAC/src/run-pipeline.sh CNAsetup \
      -i ${tarball} \
      -c ${codeDir} \
      ${"-g " + groups} \
      ${"-pe " + jidMax}
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  output {
    # one row per group, columns in order: rna, cna, pome
    Array[Array[File]] matrixFiles = read_tsv ("file_table.tsv")
    # one subgroup name per line
    Array[String] subgroups = read_lines ("subgroups.txt")
    File jidsFile = "jids.txt"
    File outputs = "CNAsetup-output.tar"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}



# Unpacks the setup tarball, copies every result table and plot into the
# "cna" directory inside the extracted analysis directory, and re-tars the
# analysis directory as CNA-output.tar.
task assemble_results {
  # One inner array of result tables per scattered sub-workflow call.
  Array[Array[File]] table_files
  # One plot per scattered sub-workflow call.
  Array[File] plot_files
  # Tarball whose top-level directory is the analysis directory.
  File tarball

  command {
    set -eu   # deliberately no -o pipefail: it causes error 141 in pipelines of the form ... | head -1 | ...
    # extract tarball in current directory and set $analysis_dir
    # (the top-level directory name is taken from the first entry in the tar listing)
    tar -x -f ${tarball}
    analysis_dir=`tar -t -f ${tarball} | head -1 | sed -e 's/\/.*//'`
    cd $analysis_dir
    # copy result tables/plots to the cna directory
    # (first flatten the table_files 2D array:
    #  using sep=" " creates ["item1.1", ... "item1.n"] ["item2.1", ... "item2.n"] ...
    #  which is flattened by deleting the characters [ ] , " )
    file_list=`echo '${sep=" " table_files}' | tr -d '][,"'`
    cp $file_list cna
    cp ${sep=" " plot_files} cna
    # recreate new tarball for output, one level above the analysis directory
    cd ..
    tar -c -f CNA-output.tar $analysis_dir
  }

  output {
    File outputs = "CNA-output.tar"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}




workflow run_cna_analysis_on_subgroups {
  File rna
  File cna
  File pome
  String analysisDir

  # 1. harmonize the three input tables
  call harmonize_data {
    input: rna=rna, cna=cna, pome=pome, analysisDir=analysisDir
  }

  # 2. derive per-subgroup matrix files, the subgroup names and the jid list
  call cna_analysis_setup {
    input: tarball=harmonize_data.outputs
  }

  # 3. outer scatter: one run of the imported sub-workflow per subgroup
  #    (matrixFiles columns are in order rna, cna, pome)
  scatter (groupIdx in range (length (cna_analysis_setup.subgroups))) {
    call cna_analysis.run_cna_analysis as cna_s {
      input:
        jidsFile=cna_analysis_setup.jidsFile,
        prefix=cna_analysis_setup.subgroups[groupIdx],
        rna=cna_analysis_setup.matrixFiles[groupIdx][0],
        cna=cna_analysis_setup.matrixFiles[groupIdx][1],
        pome=cna_analysis_setup.matrixFiles[groupIdx][2]
    }
  }

  # 4. collect all tables and plots into a single output tarball
  call assemble_results {
    input:
      tarball=cna_analysis_setup.outputs,
      table_files=cna_s.tables,
      plot_files=cna_s.plot
  }

  output {
    File final_output = assemble_results.outputs
  }
}

cna_analysis.wdl

# Runs one shard (job id `jid` out of `jidMax`) of the CNA correlation
# analysis for the subgroup named `prefix`, producing four csv files in
# the "<prefix>-output" directory.
task cna_analysis {
  # Matrix files passed positionally to cna-analysis.r.
  File rna
  File cna
  File pome
  # Subgroup name; used for the output directory name and passed to the R script.
  String prefix
  # Total number of shards and the index of this shard.
  Int jidMax
  Int jid
  # Directory the R scripts are copied from.
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"

  command {
    set -euo pipefail
    # setup directories and code
    # (string and path interpolations are quoted so that values containing
    #  whitespace or glob characters reach each command as a single argument)
    cp "${codeDir}/cna-analysis.r" "${codeDir}/generate-cna-plots.r" .
    mkdir -p "${prefix}-output"
    # run cna analysis for corresponding shard / gather
    Rscript cna-analysis.r ${jid} ${jidMax} "${prefix}" "${rna}" "${cna}" "${pome}"
  }

  output {
    File rna_cna_corr = "${prefix}-output/mrna-vs-cna-corr${jid}.csv"
    File rna_cna_pval = "${prefix}-output/mrna-vs-cna-pval${jid}.csv"
    File pome_cna_corr = "${prefix}-output/pome-vs-cna-corr${jid}.csv"
    File pome_cna_pval = "${prefix}-output/pome-vs-cna-pval${jid}.csv"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}



# Gather step for one subgroup: moves the per-shard csv files into
# "<prefix>-output" and runs cna-analysis.r with job id 0 and NULL in place
# of the three matrix files.
task gather_results_and_plot {
  # Subgroup name; used for the output directory and the output file names.
  String prefix
  # Total number of shards that were scattered.
  Int jidMax
  # Per-shard result files collected from the scatter.
  Array[File] rna_vs_cna_corr
  Array[File] rna_vs_cna_pval
  Array[File] pome_vs_cna_corr
  Array[File] pome_vs_cna_pval
  # Directories the R scripts and the reference csv files are copied from.
  String codeDir = "/prot/proteomics/Projects/PGDAC/src"
  String dataDir = "/prot/proteomics/Projects/PGDAC/data"


  command {
    set -euo pipefail
    # setup directories and code
    # (string interpolations are quoted so that values containing whitespace
    #  or glob characters reach each command as a single argument)
    cp "${codeDir}/cna-analysis.r" "${codeDir}/generate-cna-plots.r" .
    cp "${dataDir}/chr-length.csv" "${dataDir}/gene-location.csv" .
    mkdir -p "${prefix}-output"
    # copy results from scatter operation
    # (the file lists stay unquoted: they must split into one argument per file)
    mv ${sep=" " rna_vs_cna_corr} "${prefix}-output"
    mv ${sep=" " rna_vs_cna_pval} "${prefix}-output"
    mv ${sep=" " pome_vs_cna_corr} "${prefix}-output"
    mv ${sep=" " pome_vs_cna_pval} "${prefix}-output"
    # run cna analysis for corresponding shard / gather
    Rscript cna-analysis.r 0 ${jidMax} "${prefix}" NULL NULL NULL
  }

  output {
    Array[File] tables=glob ("${prefix}-*-vs-*.csv")
    File plot="${prefix}-cna-plot.png"
  }

  runtime {
    docker : "broadcptac/pgdac_basic:1"
  }

  meta {
    author : "D. R. Mani"
    email : "manidr@broadinstitute.org"
  }
}




workflow run_cna_analysis {
  File rna
  File cna
  File pome
  String prefix
  File jidsFile
  # one job id per line of jidsFile; the number of lines is the shard count
  Array[Int] jids = read_lines ("${jidsFile}")
  Int jidMax = length (jids)

  # inner scatter: one cna_analysis shard per job id
  scatter (shardId in jids) {
    call cna_analysis {
      input:
        jid=shardId,
        jidMax=jidMax,
        prefix=prefix,
        rna=rna,
        cna=cna,
        pome=pome
    }
  }

  # gather: combine the per-shard csv files and produce the tables and plot
  call gather_results_and_plot {
    input:
      jidMax=jidMax,
      prefix=prefix,
      rna_vs_cna_corr=cna_analysis.rna_cna_corr,
      rna_vs_cna_pval=cna_analysis.rna_cna_pval,
      pome_vs_cna_corr=cna_analysis.pome_cna_corr,
      pome_vs_cna_pval=cna_analysis.pome_cna_pval
  }

  output {
    Array[File] tables = gather_results_and_plot.tables
    File plot = gather_results_and_plot.plot
  }
}

Viewing all articles
Browse latest Browse all 1147

Trending Articles