Skip to content

Commit

Permalink
Replace MAP_VAF & VAF_CR with MAP_HF & HF_CR and add HPC
Browse files Browse the repository at this point in the history
Rather than reporting a single statistic (or pair), report statistics for each called haplotype. This addresses the issue faced in issue #81.

TODO: update header rows once the VCF spec adds special Number "P" (see samtools/hts-specs#442).
  • Loading branch information
Daniel Cooke committed Aug 30, 2019
1 parent 0736ae8 commit 48e2b5b
Show file tree
Hide file tree
Showing 4 changed files with 115 additions and 117 deletions.
113 changes: 51 additions & 62 deletions src/core/callers/cancer_caller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -852,27 +852,37 @@ auto compute_marginal_credible_interval(const model::SomaticSubcloneModel::Prior
return maths::beta_hdi(alphas[k], a0 - alphas[k], mass);
}

auto compute_marginal_credible_intervals(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphas& alphas,
const double mass)
struct VAFStats
{
using CredibleRegion = std::pair<double, double>;
CredibleRegion credible_region;
double map, count;
};

auto compute_vaf_stats(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphas& alphas,
const double credible_mass)
{
const auto a0 = std::accumulate(std::cbegin(alphas), std::cend(alphas), 0.0);
std::vector<std::pair<double, double>> result {};
std::vector<VAFStats> result {};
result.reserve(alphas.size());
for (const auto& alpha : alphas) {
result.push_back(maths::beta_hdi(alpha, a0 - alpha, mass));
for (std::size_t i {0}; i < alphas.size(); ++i) {
auto map_vaf = maths::dirichlet_expectation(i, alphas);
auto vaf_cr = maths::beta_hdi(alphas[i], a0 - alphas[i], credible_mass);
result.push_back({vaf_cr, map_vaf, alphas[i]});
}
return result;
}

using CredibleRegionMap = std::unordered_map<SampleName, std::vector<std::pair<double, double>>>;
using VAFStatsVector = std::vector<VAFStats>;
using VAFStatsMap = std::unordered_map<SampleName, VAFStatsVector>;

auto compute_marginal_credible_intervals(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphaMap& alphas,
const double mass)
auto compute_vaf_stats(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphaMap& alphas,
const double credible_mass)
{
CredibleRegionMap result {};
VAFStatsMap result {};
result.reserve(alphas.size());
for (const auto& p : alphas) {
result.emplace(p.first, compute_marginal_credible_intervals(p.second, mass));
result.emplace(p.first, compute_vaf_stats(p.second, credible_mass));
}
return result;
}
Expand Down Expand Up @@ -905,29 +915,6 @@ auto compute_credible_somatic_mass(const model::SomaticSubcloneModel::Priors::Ge
return 1.0 - inv_result;
}

auto compute_map_somatic_vaf(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphas& alphas,
const unsigned somatic_ploidy)
{
double result {0.0};
for (unsigned i {1}; i <= somatic_ploidy; ++i) {
result = std::max(maths::dirichlet_expectation(alphas.size() - i, alphas), result);
}
return result;
}

using SomaticVAFMap = std::unordered_map<SampleName, double>;

auto compute_map_somatic_vafs(const model::SomaticSubcloneModel::Priors::GenotypeMixturesDirichletAlphaMap& alphas,
const unsigned somatic_ploidy)
{
SomaticVAFMap result {};
result.reserve(alphas.size());
for (const auto& p : alphas) {
result.emplace(p.first, compute_map_somatic_vaf(p.second, somatic_ploidy));
}
return result;
}

struct GermlineVariantCall : Mappable<GermlineVariantCall>
{
GermlineVariantCall() = delete;
Expand Down Expand Up @@ -993,8 +980,7 @@ struct CancerGenotypeCall

CancerGenotype<Allele> genotype;
Phred<double> posterior;
CredibleRegionMap credible_regions;
SomaticVAFMap somatic_map_vafs;
VAFStatsMap vaf_stats;
};

using CancerGenotypeCalls = std::vector<CancerGenotypeCall>;
Expand Down Expand Up @@ -1198,17 +1184,15 @@ auto call_somatic_genotypes(const CancerGenotype<Haplotype>& called_genotype,
const std::vector<GenomicRegion>& called_somatic_regions,
const std::vector<CancerGenotype<Haplotype>>& genotypes,
const std::vector<double>& genotype_posteriors,
const CredibleRegionMap& credible_regions,
const SomaticVAFMap& somatic_vafs)
const VAFStatsMap& vaf_stats)
{
CancerGenotypeCalls result {};
result.reserve(called_somatic_regions.size());
for (const auto& region : called_somatic_regions) {
auto genotype_chunk = copy<Allele>(called_genotype, region);
auto posterior = marginalise(genotype_chunk, genotypes, genotype_posteriors);
result.emplace_back(std::move(genotype_chunk), posterior);
result.back().credible_regions = credible_regions;
result.back().somatic_map_vafs = somatic_vafs;
result.back().vaf_stats = vaf_stats;
}
return result;
}
Expand Down Expand Up @@ -1253,23 +1237,27 @@ auto transform_somatic_calls(SomaticVariantCalls&& somatic_calls, CancerGenotype
std::transform(std::make_move_iterator(std::begin(somatic_calls)), std::make_move_iterator(std::end(somatic_calls)),
std::make_move_iterator(std::begin(genotype_calls)), std::back_inserter(result),
[&somatic_samples] (auto&& variant_call, auto&& genotype_call) -> std::unique_ptr<octopus::VariantCall> {
std::unordered_map<SampleName, SomaticCall::GenotypeCredibleRegions> credible_regions {};
const auto germline_ploidy = genotype_call.genotype.germline_ploidy();
for (const auto& p : genotype_call.credible_regions) {
SomaticCall::GenotypeCredibleRegions sample_credible_regions {};
sample_credible_regions.germline.reserve(germline_ploidy);
std::copy(std::cbegin(p.second), std::prev(std::cend(p.second)),
std::back_inserter(sample_credible_regions.germline));
SomaticCall::GenotypeStatsMap genotype_stats {};
genotype_stats.reserve(genotype_call.vaf_stats.size()); // num samples
for (const auto& p : genotype_call.vaf_stats) {
const VAFStatsVector& stats {p.second};
SomaticCall::GenotypeAlleleStats sample_stats {};
sample_stats.germline.reserve(genotype_call.genotype.germline_ploidy());
const auto convert_stats = [] (const VAFStats& stats) -> SomaticCall::AlleleStats {
return {stats.credible_region, stats.map, stats.count};
};
std::transform(std::cbegin(stats), std::next(std::cbegin(stats), genotype_call.genotype.germline_ploidy()),
std::back_inserter(sample_stats.germline), convert_stats);
if (std::find(std::cbegin(somatic_samples), std::cend(somatic_samples), p.first) != std::cend(somatic_samples)) {
auto somatic_idx = find_index(genotype_call.genotype.somatic(), variant_call.variant.get().alt_allele());
sample_credible_regions.somatic = p.second[germline_ploidy + somatic_idx];
sample_stats.somatic.reserve(genotype_call.genotype.somatic_ploidy());
std::transform(std::next(std::cbegin(stats), genotype_call.genotype.germline_ploidy()), std::cend(stats),
std::back_inserter(sample_stats.somatic), convert_stats);
}
credible_regions.emplace(p.first, std::move(sample_credible_regions));
genotype_stats.emplace(p.first, std::move(sample_stats));
}
return std::make_unique<SomaticCall>(variant_call.variant.get(), std::move(genotype_call.genotype),
genotype_call.posterior, std::move(credible_regions),
genotype_call.somatic_map_vafs,
variant_call.segregation_quality, variant_call.posterior);
variant_call.segregation_quality, genotype_call.posterior,
std::move(genotype_stats), variant_call.posterior);
});
return result;
}
Expand Down Expand Up @@ -1357,21 +1345,23 @@ CancerCaller::call_variants(const std::vector<Variant>& candidates, const Latent
auto somatic_variant_calls = call_somatic_variants(somatic_allele_posteriors, *called_cancer_genotype,
parameters_.min_somatic_posterior);
const auto& somatic_alphas = latents.somatic_model_inferences_.max_evidence_params.alphas;
const auto credible_regions = compute_marginal_credible_intervals(somatic_alphas, parameters_.credible_mass);
const auto vaf_stats = compute_vaf_stats(somatic_alphas, parameters_.credible_mass);
if (!somatic_variant_calls.empty()) {
for (const auto& p : credible_regions) {
for (const auto& p : vaf_stats) {
const auto& sample = p.first;
const auto& sample_vaf_stats = p.second;
if (debug_log_) {
auto ss = stream(*debug_log_);
ss << p.first << " somatic credible regions: ";
for (auto cr : p.second) ss << '(' << cr.first << ' ' << cr.second << ") ";
ss << sample << " somatic credible regions: ";
for (auto stats : sample_vaf_stats) ss << '(' << stats.credible_region.first << ' ' << stats.credible_region.second << ") ";
}
if (std::any_of(std::next(std::cbegin(p.second), parameters_.ploidy), std::cend(p.second),
[this] (const auto& credible_region) { return credible_region.first >= parameters_.min_credible_somatic_frequency; })) {
if (has_normal_sample() && p.first == normal_sample()) {
if (std::any_of(std::next(std::cbegin(sample_vaf_stats), parameters_.ploidy), std::cend(sample_vaf_stats),
[this] (const auto& stats) { return stats.credible_region.first >= parameters_.min_credible_somatic_frequency; })) {
if (has_normal_sample() && sample == normal_sample()) {
somatic_samples.clear();
break;
}
somatic_samples.push_back(p.first);
somatic_samples.push_back(sample);
}
}
if (latents.noise_model_inferences_ && latents.normal_germline_inferences_) {
Expand Down Expand Up @@ -1401,11 +1391,10 @@ CancerCaller::call_variants(const std::vector<Variant>& candidates, const Latent
*debug_log_ << "Called somatic variants:";
debug::print_variants(stream(*debug_log_), somatic_variant_calls);
}
const auto somatic_vafs = compute_map_somatic_vafs(somatic_alphas, latents.somatic_ploidy_);
const auto called_somatic_regions = extract_regions(somatic_variant_calls);
auto cancer_genotype_calls = call_somatic_genotypes(*called_cancer_genotype, called_somatic_regions,
latents.cancer_genotypes_, cancer_genotype_posteriors,
credible_regions, somatic_vafs);
vaf_stats);
result = transform_somatic_calls(std::move(somatic_variant_calls), std::move(cancer_genotype_calls), somatic_samples);
} else if (debug_log_) {
stream(*debug_log_) << "Conflict between called germline genotype and called cancer genotype. Not calling somatics";
Expand Down
5 changes: 3 additions & 2 deletions src/core/tools/vcf_header_factory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ VcfHeaderFactory::AnnotatorMap VcfHeaderFactory::annotators_ =
hb.add_info("SOMATIC", "0", "Flag", "Indicates that the record is a somatic mutation, for cancer genomics");
hb.add_info("PP", "1", "Float", "Posterior probability for assertions made in ALT and FORMAT (Phred scale)");
hb.add_info("MP", "1", "Float", "Model posterior");
hb.add_format("MAP_VAF", "1", "Float", "Maximum a posteriori Variant Allele Frequency");
hb.add_format("VAF_CR", "2", "Float", "Credible region for the Variant Allele Frequency");
hb.add_format("HPC", ".", "Float", "Posterior pseudo counts for each haplotype");
hb.add_format("MAP_HF", ".", "Float", "Maximum a posteriori haplotype frequencies");
hb.add_format("HF_CR", ".", "Float", "Haplotype frequency credible regions");
}},
{std::type_index(typeid(DenovoCall)), [] (auto& hb) {
hb.add_info("DENOVO", "0", "Flag", "Indicates that the record is a de novo mutation");
Expand Down
58 changes: 43 additions & 15 deletions src/core/types/calls/somatic_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@

namespace octopus {

SomaticCall::SomaticCall(Variant variant,
const CancerGenotype<Allele>& genotype_call,
Phred<double> quality,
Phred<double> genotype_posterior,
GenotypeStatsMap stats,
boost::optional<Phred<double>> classification_posterior)
: VariantCall {std::move(variant), decltype(genotype_calls_) {}, quality, classification_posterior}
, genotype_stats_ {std::move(stats)}
{
if (variant_.ref_allele() == variant_.alt_allele()) {
Allele::NucleotideSequence missing_sequence(ref_sequence_size(variant_), 'N');
using octopus::mapped_region;
variant_ = Variant {
Allele {mapped_region(variant_), std::move(missing_sequence)},
variant_.alt_allele()
};
}
genotype_calls_.reserve(genotype_stats_.size()); // num samples
for (const auto& p : genotype_stats_) {
genotype_calls_.emplace(p.first, GenotypeCall {p.second.somatic.empty() ? genotype_call.germline() : demote(genotype_call), genotype_posterior});
}
}

static std::string to_string_sf(const double val, const int sf = 2)
{
return utils::strip_leading_zeroes(utils::to_string(val, sf, utils::PrecisionRule::sf));
Expand All @@ -18,22 +41,27 @@ void SomaticCall::decorate(VcfRecord::Builder& record) const
if (posterior_) {
record.set_info("PP", utils::to_string(posterior_->score()));
}
if (!map_vafs_.empty()) {
record.add_format("MAP_VAF");
}
record.add_format("VAF_CR");
for (const auto& p : credible_regions_) {
if (p.second.somatic) {
if (!map_vafs_.empty()) {
record.set_format(p.first, "MAP_VAF", to_string_sf(map_vafs_.at(p.first)));
}
record.set_format(p.first, "VAF_CR", {to_string_sf(p.second.somatic->first), to_string_sf(p.second.somatic->second)});
} else {
if (!map_vafs_.empty()) {
record.set_format_missing(p.first, "MAP_VAF");
}
record.set_format_missing(p.first, "VAF_CR");
record.add_format({"HPC", "MAP_HF", "HF_CR"});
for (const auto& p : genotype_stats_) {
std::vector<std::string> hpcs {}, map_hfs {}, hf_crs {};
const auto ploidy = this->genotype_calls_.at(p.first).genotype.ploidy();
assert(ploidy == p.second.germline.size() + p.second.somatic.size());
hpcs.reserve(ploidy);
map_hfs.reserve(ploidy);
hf_crs.reserve(2 * ploidy);
for (const auto& stats : p.second.germline) {
hpcs.push_back(to_string_sf(stats.pseudo_count));
map_hfs.push_back(to_string_sf(stats.map_vaf));
hf_crs.insert(std::cend(hf_crs), {to_string_sf(stats.vaf_credible_region.first), to_string_sf(stats.vaf_credible_region.second)});
}
for (const auto& stats : p.second.somatic) {
hpcs.push_back(to_string_sf(stats.pseudo_count));
map_hfs.push_back(to_string_sf(stats.map_vaf));
hf_crs.insert(std::cend(hf_crs), {to_string_sf(stats.vaf_credible_region.first), to_string_sf(stats.vaf_credible_region.second)});
}
record.set_format(p.first, "HPC", std::move(hpcs));
record.set_format(p.first, "MAP_HF", std::move(map_hfs));
record.set_format(p.first, "HF_CR", std::move(hf_crs));
}
}

Expand Down
56 changes: 18 additions & 38 deletions src/core/types/calls/somatic_call.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

#include "config/common.hpp"
#include "core/types/allele.hpp"
#include "core/types/variant.hpp"
#include "core/types/cancer_genotype.hpp"
#include "variant_call.hpp"

Expand All @@ -23,20 +24,27 @@ class SomaticCall : public VariantCall
using VariantCall::GenotypeCall;
using VariantCall::PhaseCall;

using CredibleRegion = std::pair<double, double>;
struct AlleleStats
{
using CredibleRegion = std::pair<double, double>;
CredibleRegion vaf_credible_region;
double map_vaf, pseudo_count;
};

struct GenotypeCredibleRegions
struct GenotypeAlleleStats
{
std::vector<CredibleRegion> germline;
boost::optional<CredibleRegion> somatic;
std::vector<AlleleStats> germline, somatic;
};

using GenotypeStatsMap = std::unordered_map<SampleName, GenotypeAlleleStats>;

SomaticCall() = delete;

template <typename V, typename C, typename M>
SomaticCall(V&& variant, const CancerGenotype<Allele>& genotype_call,
Phred<double> genotype_posterior, C&& credible_regions, M&& map_vafs,
Phred<double> quality, boost::optional<Phred<double>> posterior = boost::none);
SomaticCall(Variant variant,
const CancerGenotype<Allele>& genotype_call,
Phred<double> quality, Phred<double> genotype_posterior,
GenotypeStatsMap stats,
boost::optional<Phred<double>> classification_posterior = boost::none);

SomaticCall(const SomaticCall&) = default;
SomaticCall& operator=(const SomaticCall&) = default;
Expand All @@ -48,40 +56,12 @@ class SomaticCall : public VariantCall
virtual bool requires_model_evaluation() const noexcept override { return true; }

protected:
std::unordered_map<SampleName, GenotypeCredibleRegions> credible_regions_;
std::unordered_map<SampleName, double> map_vafs_;
GenotypeStatsMap genotype_stats_;

private:
virtual std::unique_ptr<Call> do_clone() const override;
};

template <typename V, typename C, typename M>
SomaticCall::SomaticCall(V&& variant,
const CancerGenotype<Allele>& genotype_call,
Phred<double> genotype_posterior,
C&& credible_regions, M&& map_vafs,
Phred<double> quality, boost::optional<Phred<double>> posterior)
: VariantCall {std::forward<V>(variant), decltype(genotype_calls_) {}, quality, posterior}
, credible_regions_ {std::forward<C>(credible_regions)}
, map_vafs_ {std::forward<M>(map_vafs)}
{
if (variant_.ref_allele() == variant_.alt_allele()) {
Allele::NucleotideSequence missing_sequence(ref_sequence_size(variant_), 'N');
using octopus::mapped_region;
variant_ = Variant {
Allele {mapped_region(variant_), std::move(missing_sequence)},
variant_.alt_allele()
};
}
genotype_calls_.reserve(credible_regions_.size()); // num samples
for (const auto& p : credible_regions_) {
if (p.second.somatic) {
genotype_calls_.emplace(p.first, GenotypeCall {demote(genotype_call), genotype_posterior});
} else {
genotype_calls_.emplace(p.first, GenotypeCall {genotype_call.germline(), genotype_posterior});
}
}
}

} // namespace octopus

#endif

0 comments on commit 48e2b5b

Please sign in to comment.