Skip to content

Commit

Permalink
[feat] doNJ can now use vcf ind names as labels
Browse files Browse the repository at this point in the history
Add option to make doNJ use ind names from vcf file as tip labels
instead of requiring a metadata file input

Add realloc macro with safe null checking
  • Loading branch information
isinaltinkaya committed Jul 5, 2023
1 parent 66d36a2 commit 0c2e216
Show file tree
Hide file tree
Showing 11 changed files with 245 additions and 151 deletions.
34 changes: 13 additions & 21 deletions dataStructs.cpp
Expand Up @@ -813,14 +813,6 @@ const int calculateBufferSizeCsv(const int n_vals, const int max_digits) {
return ((n_vals * (max_digits + 1)) + 2);
}

void distanceMatrixStruct::set_item_labels(char **itemLabelArr) {
ASSERT(itemLabelArr != NULL);
itemLabels = (char **)malloc(nInd * sizeof(char *));
for (int i = 0; i < nInd; i++) {
itemLabels[i] = strdup(itemLabelArr[i]);
}
}

void distanceMatrixStruct::print() {
if (0 == args->printDistanceMatrix)
return;
Expand Down Expand Up @@ -853,12 +845,12 @@ distanceMatrixStruct::distanceMatrixStruct(int nInd_, int nIndCmb_, int isSquare
}
isSquared = isSquared_;

if (itemLabels_ != NULL) {
itemLabels = (char **)malloc(nInd * sizeof(char *));
for (int i = 0; i < nInd; i++) {
itemLabels[i] = strdup(itemLabels_[i]);
}
}
// if (itemLabels_ != NULL) {
// itemLabels = (char **)malloc(nInd * sizeof(char *));
// for (int i = 0; i < nInd; i++) {
// itemLabels[i] = strdup(itemLabels_[i]);
// }
// }

inds2idx = (int **)malloc(nInd * sizeof(int *));
for (int i = 0; i < nInd; i++) {
Expand All @@ -881,13 +873,13 @@ distanceMatrixStruct::distanceMatrixStruct(int nInd_, int nIndCmb_, int isSquare
distanceMatrixStruct::~distanceMatrixStruct() {
delete[] M;

if (itemLabels != NULL) {
for (int i = 0; i < nInd; i++) {
FREE(itemLabels[i]);
}
FREE(itemLabels);
}

// if (itemLabels != NULL) {
// for (int i = 0; i < nInd; i++) {
// FREE(itemLabels[i]);
// }
// FREE(itemLabels);
// }
//
for (int i = 0; i < nInd; i++) {
FREE(inds2idx[i]);
}
Expand Down
5 changes: 2 additions & 3 deletions dataStructs.h
Expand Up @@ -256,7 +256,7 @@ struct distanceMatrixStruct {
// inds2idx[i1][i2] = index of the pair (i1,i2) in the distance matrix
int **inds2idx = NULL;

char **itemLabels = NULL;
// char **itemLabels = NULL;

int nInd = 0;
int nIndCmb = 0;
Expand All @@ -267,7 +267,6 @@ struct distanceMatrixStruct {

void print();

void set_item_labels(char **indNames);
};

/// @brief read distance matrix from distance matrix csv file
Expand Down Expand Up @@ -508,4 +507,4 @@ struct indPairThreads {
}
};

#endif // __DATA_STRUCTS__
#endif // __DATA_STRUCTS__
198 changes: 100 additions & 98 deletions neighborJoining.cpp
Expand Up @@ -66,7 +66,7 @@ void njStruct::_print(void) {
}
}

njStruct::njStruct(distanceMatrixStruct *dms) {
njStruct::njStruct(distanceMatrixStruct *dms, paramStruct* pars) {
DistanceMatrixSt = dms;

// initialize the number of leaf nodes
Expand Down Expand Up @@ -133,7 +133,7 @@ njStruct::njStruct(distanceMatrixStruct *dms) {
}
}

nodeLabels = (char **)malloc(nTreeNodes * sizeof(char *));
nodeLabels = (char **)malloc(nTreeNodes * sizeof(char *));
for (int i = 0; i < nTreeNodes; ++i) {
if (i < totL) {
for (int j = i + 1; j < totL; ++j) {
Expand All @@ -145,20 +145,24 @@ njStruct::njStruct(distanceMatrixStruct *dms) {
}
}

nodeLabels[i] = NULL;
if (i < totL) {
if (DistanceMatrixSt->itemLabels != NULL) {
ASSERT(DistanceMatrixSt->itemLabels[i] != NULL);
nodeLabels[i] = strdup(DistanceMatrixSt->itemLabels[i]);
if (pars->indNames != NULL) {
ASSERT(pars->indNames->vals!=NULL);
ASSERT(pars->indNames->vals[i]!=NULL);
ASSERT(nodeLabels!=NULL);
ASSERT(i<=pars->indNames->nvals);
nodeLabels[i]=strdup(pars->indNames->vals[i]);
// nodeLabels = pars->indNames->vals;
// ASSERT(nodeLabels!=NULL);
} else {
char label[100];
sprintf(label, "item%d", i);
nodeLabels[i] = strdup(label);
}
} else {
char label[100];
sprintf(label, "node%d", i);
nodeLabels[i] = strdup(label);
sprintf(label, "node%d", i);
nodeLabels[i] = strdup(label);
}
}

Expand All @@ -175,10 +179,10 @@ njStruct::njStruct(distanceMatrixStruct *dms) {
njStruct::~njStruct() {
FREE(NJD);
for (int i = 0; i < nTreeNodes; ++i) {
FREE(nodeLabels[i]);
FREE(nodeLabels[i]);
FREE(items2idx[i]);
}
FREE(nodeLabels);
FREE(nodeLabels);
FREE(items2idx);

for (int i = 0; i < nTreeEdges; ++i) {
Expand Down Expand Up @@ -315,15 +319,13 @@ void njIteration(njStruct *nji) {
// calculate the distance from the new node to each child node
// d(min_i1,new_node) = ( d(min_i1,min_i2) + NetDivergence[min_i1] - NetDivergence[min_i2] ) / 2
edgeLength = (0.5 * (min_dist + min_NetDivergence));
// nji->addEdge(min_i1, parentNode, edgeLength);
nji->addEdge(parentNode, min_i1, edgeLength);
int pxnew1 = nji->items2idx[min_i1][parentNode];
nji->NJD[pxnew1] = edgeLength;

// calculate the distance from the new node to the child node 2
// d(min_i2,new_node) = ( d(min_i1,min_i2) + NetDivergence[min_i2] - NetDivergence[min_i1] ) / 2
edgeLength = (0.5 * (min_dist - min_NetDivergence));
// nji->addEdge(min_i2, parentNode, edgeLength);
nji->addEdge(parentNode, min_i2, edgeLength);
int pxnew2 = nji->items2idx[min_i2][parentNode];
nji->NJD[pxnew2] = edgeLength;
Expand Down Expand Up @@ -399,7 +401,7 @@ int njStruct::newParentNode(int child1, int child2) {

njStruct *njStruct_get(paramStruct *pars, dxyStruct *dxy) {
ASSERT(dxy != NULL);
njStruct *nj = new njStruct(dxy);
njStruct *nj = new njStruct(dxy,pars);

for (int i = 0; i < nj->nTreeIterations; i++) {
njIteration(nj);
Expand All @@ -414,7 +416,7 @@ njStruct *njStruct_get(paramStruct *pars, dxyStruct *dxy) {

njStruct *njStruct_get(paramStruct *pars, distanceMatrixStruct *dms) {
ASSERT(dms != NULL);
njStruct *nj = new njStruct(dms);
njStruct *nj = new njStruct(dms,pars);

for (int i = 0; i < nj->nTreeIterations; i++) {
njIteration(nj);
Expand All @@ -425,88 +427,88 @@ njStruct *njStruct_get(paramStruct *pars, distanceMatrixStruct *dms) {
}

// TODO do this with a template type for both dxy and distanceMatrix
njStruct::njStruct(dxyStruct *dxy) {
// TODO -limit to only one level dxy

dxySt = dxy;

// initialize the number of leaf nodes
// get number of pairs of objects (individuals or groups) in the distance matrix
totL = (1 + (sqrt(1 + (8 * dxySt->nDxy)))) / 2;
iterL = totL;

// an unrooted tree with n leaves has 2n-2 nodes and 2n-3 edges
nTreeNodes = (2 * totL) - 2;
nTreeEdges = nTreeNodes - 1;

nTreeNodePairs = (nTreeNodes * (nTreeNodes - 1)) / 2;
NJD = (double *)malloc(nTreeNodePairs * sizeof(double));
for (int i = 0; i < nTreeNodePairs; ++i) {
NJD[i] = 0.0;
}

// number of neighbor joining iterations needed to build the tree
nTreeIterations = totL - 2;

edgeLengths = (double *)malloc(nTreeEdges * sizeof(double));

edgeNodes = (int **)malloc(nTreeEdges * sizeof(int *));
for (int i = 0; i < nTreeEdges; ++i) {
edgeNodes[i] = (int *)malloc(2 * sizeof(int));
edgeNodes[i][0] = -1;
edgeNodes[i][1] = -1;

edgeLengths[i] = -1.0;
}

idx2items = (int **)malloc(nTreeNodePairs * sizeof(int *));
for (int i = 0; i < nTreeNodePairs; ++i) {
idx2items[i] = (int *)malloc(2 * sizeof(int));
idx2items[i][0] = -1;
idx2items[i][1] = -1;
}
items2idx = (int **)malloc(nTreeNodes * sizeof(int *));
for (int i = 0; i < nTreeNodes; ++i) {
items2idx[i] = (int *)malloc(nTreeNodes * sizeof(int));
for (int j = 0; j < nTreeNodes; ++j) {
items2idx[i][j] = -1;
}
}

int idx = 0;
for (int i1 = 0; i1 < nTreeNodes - 1; i1++) {
for (int i2 = i1 + 1; i2 < nTreeNodes; i2++) {
idx2items[idx][0] = i1;
idx2items[idx][1] = i2;
items2idx[i1][i2] = idx;
items2idx[i2][i1] = idx;
idx++;
}
}

nodeLabels = (char **)malloc(nTreeNodes * sizeof(char *));
for (int i = 0; i < nTreeNodes; ++i) {
if (i < totL) {
for (int j = i + 1; j < totL; ++j) {
int new_index = items2idx[i][j];
int old_index = nCk_idx(totL, i, j);
ASSERT(new_index >= 0);
ASSERT(old_index >= 0);
NJD[new_index] = dxy->dxyArr[old_index];
}
}

// TODO get groupnames, do this after changing dxy to be one struct per level
char label[100];
sprintf(label, "node%d", i);
nodeLabels[i] = strdup(label);
}

// number of neighbors identified in the previous iterations == nNodes-2
// since we terminate the iterations when nNodes==2, so we don't need to
// save the 2 neighbors identified in the last iteration
neighborIdx = (int *)malloc((nTreeNodes - 2) * sizeof(int));
for (int i = 0; i < (nTreeNodes - 2); ++i) {
neighborIdx[i] = -1;
}
njStruct::njStruct(dxyStruct *dxy, paramStruct* pars){
// TODO -limit to only one level dxy

dxySt = dxy;

// initialize the number of leaf nodes
// get number of pairs of objects (individuals or groups) in the distance matrix
totL = (1 + (sqrt(1 + (8 * dxySt->nDxy)))) / 2;
iterL = totL;

// an unrooted tree with n leaves has 2n-2 nodes and 2n-3 edges
nTreeNodes = (2 * totL) - 2;
nTreeEdges = nTreeNodes - 1;

nTreeNodePairs = (nTreeNodes * (nTreeNodes - 1)) / 2;
NJD = (double *)malloc(nTreeNodePairs * sizeof(double));
for (int i = 0; i < nTreeNodePairs; ++i) {
NJD[i] = 0.0;
}

// number of neighbor joining iterations needed to build the tree
nTreeIterations = totL - 2;

edgeLengths = (double *)malloc(nTreeEdges * sizeof(double));

edgeNodes = (int **)malloc(nTreeEdges * sizeof(int *));
for (int i = 0; i < nTreeEdges; ++i) {
edgeNodes[i] = (int *)malloc(2 * sizeof(int));
edgeNodes[i][0] = -1;
edgeNodes[i][1] = -1;

edgeLengths[i] = -1.0;
}

idx2items = (int **)malloc(nTreeNodePairs * sizeof(int *));
for (int i = 0; i < nTreeNodePairs; ++i) {
idx2items[i] = (int *)malloc(2 * sizeof(int));
idx2items[i][0] = -1;
idx2items[i][1] = -1;
}
items2idx = (int **)malloc(nTreeNodes * sizeof(int *));
for (int i = 0; i < nTreeNodes; ++i) {
items2idx[i] = (int *)malloc(nTreeNodes * sizeof(int));
for (int j = 0; j < nTreeNodes; ++j) {
items2idx[i][j] = -1;
}
}

int idx = 0;
for (int i1 = 0; i1 < nTreeNodes - 1; i1++) {
for (int i2 = i1 + 1; i2 < nTreeNodes; i2++) {
idx2items[idx][0] = i1;
idx2items[idx][1] = i2;
items2idx[i1][i2] = idx;
items2idx[i2][i1] = idx;
idx++;
}
}

nodeLabels = (char **)malloc(nTreeNodes * sizeof(char *));
for (int i = 0; i < nTreeNodes; ++i) {
if (i < totL) {
for (int j = i + 1; j < totL; ++j) {
int new_index = items2idx[i][j];
int old_index = nCk_idx(totL, i, j);
ASSERT(new_index >= 0);
ASSERT(old_index >= 0);
NJD[new_index] = dxy->dxyArr[old_index];
}
}

// TODO get groupnames, do this after changing dxy to be one struct per level
char label[100];
sprintf(label, "node%d", i);
nodeLabels[i] = strdup(label);
}

// number of neighbors identified in the previous iterations == nNodes-2
// since we terminate the iterations when nNodes==2, so we don't need to
// save the 2 neighbors identified in the last iteration
neighborIdx = (int *)malloc((nTreeNodes - 2) * sizeof(int));
for (int i = 0; i < (nTreeNodes - 2); ++i) {
neighborIdx[i] = -1;
}
}
6 changes: 3 additions & 3 deletions neighborJoining.h
Expand Up @@ -123,8 +123,8 @@ typedef struct njStruct {
/// iteration.
int isNeighbor(int nodeIndex);

njStruct(distanceMatrixStruct *dms);
njStruct(dxyStruct *dxySt);
njStruct(distanceMatrixStruct *dms, paramStruct* pars);
njStruct(dxyStruct *dxySt, paramStruct* pars);
~njStruct();

} njStruct;
Expand Down Expand Up @@ -157,4 +157,4 @@ void njIteration(njStruct *nji);
// edgeNodes[edge_index][0] = parent node
// edgeNodes[edge_index][1] = child node

#endif
#endif

0 comments on commit 0c2e216

Please sign in to comment.