Skip to content

Commit

Permalink
More robust handling of info END fields (#672)
Browse files Browse the repository at this point in the history
  • Loading branch information
gspowley committed Mar 18, 2024
1 parent 1712c9b commit 28c5e5d
Show file tree
Hide file tree
Showing 6 changed files with 213 additions and 21 deletions.
186 changes: 186 additions & 0 deletions apis/python/tests/test_tiledbvcf.py
Expand Up @@ -1609,3 +1609,189 @@ def test_bed_array(tmp_path, test_ds_v4):
expected_df,
df.sort_values(ignore_index=True, by=["sample_name", "pos_start"]),
)


def test_info_end(tmp_path):
"""
This test checks that the info_END attribute is handled correctly, even when the
VCF header incorrectly defines the END attribute as a string.
The test also checks that info_END contains the original values from the VCF,
including the missing values.
"""

expected_end = pd.DataFrame(
{
"pos_end": pd.Series(
[
12277,
12771,
13374,
13395,
13413,
13451,
13519,
13544,
13689,
17479,
17486,
30553,
35224,
35531,
35786,
69096,
69103,
69104,
69109,
69110,
69111,
69112,
69114,
69115,
69122,
69123,
69128,
69129,
69130,
69192,
69195,
69196,
69215,
69222,
69227,
69228,
69261,
69262,
69269,
69270,
69346,
69349,
69352,
69353,
69370,
69510,
69511,
69760,
69761,
69770,
69834,
69835,
69838,
69861,
69863,
69866,
69896,
69897,
69912,
69938,
69939,
69941,
69946,
69947,
69948,
69949,
69953,
70012,
866511,
1289369,
],
dtype=np.int32,
),
# Expected values are strings because the small3.vcf.gz defines END as a string
"info_END": pd.Series(
[
"12277",
"12771",
"13374",
"13395",
"13413",
"13451",
"13519",
"13544",
"13689",
"17479",
"17486",
"30553",
"35224",
"35531",
"35786",
"69096",
"69103",
"69104",
"69109",
"69110",
"69111",
"69112",
"69114",
"69115",
"69122",
"69123",
"69128",
"69129",
"69130",
"69192",
"69195",
"69196",
"69215",
"69222",
"69227",
"69228",
"69261",
"69262",
"69269",
None,
"69346",
"69349",
"69352",
"69353",
"69370",
"69510",
None,
"69760",
None,
"69770",
"69834",
"69835",
"69838",
"69861",
"69863",
"69866",
"69896",
None,
"69912",
"69938",
"69939",
"69941",
"69946",
"69947",
"69948",
"69949",
"69953",
"70012",
None,
None,
],
dtype=object,
),
}
)

# Ingest the data
uri = os.path.join(tmp_path, "dataset")
ds = tiledbvcf.Dataset(uri, mode="w")
samples = [os.path.join(TESTS_INPUT_DIR, s) for s in ["small3.vcf.gz"]]
ds.create_dataset()
ds.ingest_samples(samples)

# Read the data
ds = tiledbvcf.Dataset(uri)
df = ds.read(attrs=["sample_name", "pos_start", "pos_end", "info_END"])

# Sort the results because VCF uses an unordered reader
df.sort_values(ignore_index=True, by=["sample_name", "pos_start"], inplace=True)

# Drop the columns that are not used for comparison
df.drop(columns=["sample_name", "pos_start"], inplace=True)

# Check the results
_check_dfs(df, expected_end)
37 changes: 25 additions & 12 deletions libtiledbvcf/src/vcf/vcf_utils.cc
Expand Up @@ -31,20 +31,33 @@ namespace vcf {

uint32_t VCFUtils::get_end_pos(
bcf_hdr_t* hdr, bcf1_t* rec, HtslibValueMem* val) {
val->ndst = HtslibValueMem::convert_ndst_for_type(
val->ndst, BCF_HT_INT, &val->type_for_ndst);
int rc =
bcf_get_info_values(hdr, rec, "END", &val->dst, &val->ndst, BCF_HT_INT);
if (rc > 0) {
assert(val->dst);
assert(val->ndst >= 1);
uint32_t copy = ((uint32_t*)val->dst)[0];
assert(copy > 0);
copy -= 1;
return copy;
// Set the default value of END, in case it is not present in the VCF
uint32_t end = rec->pos + rec->rlen - 1;

int* dst = nullptr;
int ndst = 0;

// Check if END is present as an integer
int ret = bcf_get_info_int32(hdr, rec, "END", &dst, &ndst);
if (ret >= 0) {
end = *dst - 1;
} else {
return (uint32_t)rec->pos + rec->rlen - 1;
// Check if END is present as a string
ret = bcf_get_info_string(hdr, rec, "END", &dst, &ndst);
if (ret >= 0) {
std::string end_str = reinterpret_cast<char*>(dst);
try {
end = std::stoi(end_str) - 1;
} catch (std::invalid_argument const& e) {
throw std::runtime_error(
"Error parsing END field as an integer: " + end_str);
}
}
}

hts_free(dst);

return end;
}

bcf_hdr_t* VCFUtils::hdr_read_header(const std::string& path) {
Expand Down
7 changes: 0 additions & 7 deletions libtiledbvcf/src/write/writer_worker_v4.cc
Expand Up @@ -341,13 +341,6 @@ bool WriterWorkerV4::buffer_record(const RecordHeapV4::Node& node) {
int info_id = info->key;
const char* key = bcf_hdr_int2id(hdr, BCF_DT_ID, info_id);

// No need to store END.
if (strcmp("END", key) == 0) {
infos_extracted[i] = true;
n_info_as_attr++;
continue;
}

Buffer* buff;
if (buffers_.extra_attr(std::string("info_") + key, &buff)) {
// No need to store the string key, as it's an extracted attribute.
Expand Down
4 changes: 2 additions & 2 deletions libtiledbvcf/test/inputs/small3.vcf
Expand Up @@ -18,7 +18,7 @@
##INFO=<ID=ClippingRankSum,Number=1,Type=Float,Description="Z-score From Wilcoxon rank sum test of Alt vs. Ref number of hard clipped bases">
##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">
##INFO=<ID=DS,Number=0,Type=Flag,Description="Were any of the samples downsampled?">
##INFO=<ID=END,Number=1,Type=Integer,Description="Stop position of the interval">
##INFO=<ID=END,Number=1,Type=String,Description="Stop position of the interval">
##INFO=<ID=HaplotypeScore,Number=1,Type=Float,Description="Consistency of the site with at most two segregating haplotypes">
##INFO=<ID=InbreedingCoeff,Number=1,Type=Float,Description="Inbreeding coefficient as estimated from the genotype likelihoods per-sample when compared against the Hardy-Weinberg expectation">
##INFO=<ID=MLEAC,Number=A,Type=Integer,Description="Maximum likelihood expectation (MLE) for the allele counts (not necessarily the same as the AC), for each ALT allele, in the same order as listed">
Expand Down Expand Up @@ -186,4 +186,4 @@
1 69950 . T <NON_REF> . . END=69953 GT:DP:GQ:MIN_DP:PL 0/0:5:0:5:0,0,101
1 69954 . G <NON_REF> . . END=70012 GT:DP:GQ:MIN_DP:PL 0/0:4:9:3:0,6,90
1 866511 . T CCCCTCCCT,C,CCCCTCCCTCCCT,CCCCT . LowQual . GT 4/1
1 1289367 rs1497816 CTG C . LowQual . GT 0/1
1 1289367 rs1497816 CTG C . LowQual . GT 0/1
Binary file modified libtiledbvcf/test/inputs/small3.vcf.gz
Binary file not shown.
Binary file modified libtiledbvcf/test/inputs/small3.vcf.gz.csi
Binary file not shown.

0 comments on commit 28c5e5d

Please sign in to comment.