From d4aa4871122b35ac92e2cc13d9b1b1e9c5b5dc5c Mon Sep 17 00:00:00 2001 From: thegenemyers Date: Mon, 15 Jun 2015 12:31:31 +0200 Subject: [PATCH] Interactive version & alignment improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit DB.[ch], QV.[ch], and align.[ch] can now be compiled in an interactive version by setting the defined constant INTERACTIVE in DB.h. Rather than printing an error message to stderr and exiting with code 1, the interactive version of the routines place an error message in a buffer EPLACE and return an error code to the caller. The aligner used O(N+M) space where N and M are the lengths of the two “reads” being compared. Now a fixed amount of memory is used so that one can compare for example 2 chromosomes against each other without exceeding memory. The daligner filter has been upgraded so that when comparing two very large sequences (say entire chromosomes), it is guaranteed to explore and report all independent alignments paths in the underlying edit graph. --- DB.c | 438 +++++++++++++++++++++++------------ DB.h | 83 +++++-- QV.c | 287 ++++++++++++++++------- QV.h | 35 ++- align.c | 688 +++++++++++++++++++++++++++++++++++++------------------ align.h | 64 ++++-- filter.c | 240 ++++++++++++------- 7 files changed, 1241 insertions(+), 594 deletions(-) diff --git a/DB.c b/DB.c index f0f654c..27b202e 100644 --- a/DB.c +++ b/DB.c @@ -73,14 +73,20 @@ char *Prog_Name; +#ifdef INTERACTIVE + +char Ebuffer[1000]; + +#endif + void *Malloc(int64 size, char *mesg) { void *p; if ((p = malloc(size)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } @@ -88,9 +94,9 @@ void *Malloc(int64 size, char *mesg) void *Realloc(void *p, int64 size, char *mesg) { if ((p = realloc(p,size)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (p); } @@ -102,9 +108,9 @@ char *Strdup(char *name, char *mesg) return (NULL); if ((s = strdup(name)) == NULL) { if (mesg == NULL) - fprintf(stderr,"%s: Out of memory\n",Prog_Name); + EPRINTF(EPLACE,"%s: Out of memory\n",Prog_Name); else - fprintf(stderr,"%s: Out of memory (%s)\n",Prog_Name,mesg); + EPRINTF(EPLACE,"%s: Out of memory (%s)\n",Prog_Name,mesg); } return (s); } @@ -115,7 +121,7 @@ FILE *Fopen(char *name, char *mode) if (name == NULL || mode == NULL) return (NULL); if ((f = fopen(name,mode)) == NULL) - fprintf(stderr,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); + EPRINTF(EPLACE,"%s: Cannot open %s for '%s'\n",Prog_Name,name,mode); return (f); } @@ -181,7 +187,7 @@ char *Catenate(char *path, char *sep, char *root, char *suffix) if (len > max) { max = ((int) (1.2*len)) + 100; if ((cat = (char *) realloc(cat,max+1)) == NULL) - { fprintf(stderr,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); + { EPRINTF(EPLACE,"%s: Out of memory (Making path name for %s)\n",Prog_Name,root); return (NULL); } } @@ -201,7 +207,7 @@ char *Numbered_Suffix(char *left, int num, char *right) if (len > max) { max = ((int) (1.2*len)) + 100; if ((suffix = (char *) realloc(suffix,max+1)) == NULL) - { fprintf(stderr,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); + { EPRINTF(EPLACE,"%s: Out of memory (Making number suffix for %d)\n",Prog_Name,num); return (NULL); } } @@ -383,19 +389,21 @@ void Number_Read(char *s) // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: -// -1: The DB could not be opened for a reason reported by the routine to stderr +// -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap int Open_DB(char* path, HITS_DB *db) -{ char *root, *pwd, *bptr, *fptr, *cat; - int nreads; - FILE *index, *dbvis; - int status, plen, isdam; - int part, cutoff, all; - int ufirst, tfirst, ulast, tlast; +{ HITS_DB dbcopy; + char *root, *pwd, *bptr, *fptr, *cat; + int nreads; + FILE *index, *dbvis; + int status, plen, isdam; + int part, cutoff, all; + int ufirst, tfirst, ulast, tlast; status = -1; + dbcopy = *db; plen = strlen(path); if (strcmp(path+(plen-4),".dam") == 0) @@ -418,22 +426,24 @@ int Open_DB(char* path, HITS_DB *db) isdam = 0; cat = Catenate(pwd,"/",root,".db"); if (cat == NULL) - exit (1); + return (-1); if ((dbvis = fopen(cat,"r")) == NULL) { cat = Catenate(pwd,"/",root,".dam"); if (cat == NULL) - exit (1); + return (-1); if ((dbvis = fopen(cat,"r")) == NULL) - { fprintf(stderr,"%s: Could not open database %s\n",Prog_Name,path); - goto exit; + { EPRINTF(EPLACE,"%s: Could not open database %s\n",Prog_Name,path); + goto error; } isdam = 1; } if ((index = Fopen(Catenate(pwd,PATHSEP,root,".idx"),"r")) == NULL) - goto exit1; + goto error1; if (fread(db,sizeof(HITS_DB),1,index) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + goto error2; + } { int p, nblocks, nfiles; int64 size; @@ -441,35 +451,45 @@ int Open_DB(char* path, HITS_DB *db) nblocks = 0; if (fscanf(dbvis,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } for (p = 0; p < nfiles; p++) if (fscanf(dbvis,DB_FDATA,&tlast,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (fscanf(dbvis,DB_NBLOCK,&nblocks) != 1) if (part == 0) { cutoff = 0; all = 1; } else - { fprintf(stderr,"%s: DB %s has not yet been partitioned, cannot request a block !\n", + { EPRINTF(EPLACE,"%s: DB %s has not yet been partitioned, cannot request a block !\n", Prog_Name,root); - goto exit2; + goto error2; } else { if (fscanf(dbvis,DB_PARAMS,&size,&cutoff,&all) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (part > nblocks) - { fprintf(stderr,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); - goto exit2; + { EPRINTF(EPLACE,"%s: DB %s has only %d blocks\n",Prog_Name,root,nblocks); + goto error2; } } if (part > 0) { for (p = 1; p <= part; p++) if (fscanf(dbvis,DB_BDATA,&ufirst,&tfirst) != 2) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } if (fscanf(dbvis,DB_BDATA,&ulast,&tlast) != 2) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error2; + } } else { ufirst = tfirst = 0; @@ -491,7 +511,10 @@ int Open_DB(char* path, HITS_DB *db) { db->reads = (HITS_READ *) Malloc(sizeof(HITS_READ)*(nreads+2),"Allocating Open_DB index"); db->reads += 1; if (fread(db->reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(db->reads); + goto error2; + } } else { HITS_READ *reads; @@ -503,7 +526,10 @@ int Open_DB(char* path, HITS_DB *db) fseeko(index,sizeof(HITS_READ)*ufirst,SEEK_CUR); if (fread(reads,sizeof(HITS_READ),nreads,index) != (size_t) nreads) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + free(reads); + goto error2; + } totlen = 0; maxlen = 0; @@ -524,22 +550,27 @@ int Open_DB(char* path, HITS_DB *db) db->nreads = nreads; db->path = Strdup(Catenate(pwd,PATHSEP,root,""),"Allocating Open_DB path"); + if (db->path == NULL) + goto error2; db->bases = NULL; db->loaded = 0; status = isdam; -exit2: +error2: fclose(index); -exit1: +error1: fclose(dbvis); -exit: +error: if (bptr != NULL) *bptr = '.'; free(pwd); free(root); + if (status < 0) + *db = dbcopy; + return (status); } @@ -682,61 +713,58 @@ void Close_DB(HITS_DB *db) HITS_DB *Active_DB = NULL; // Last db/qv used by "Load_QVentry" HITS_QV *Active_QV; // Becomes invalid after closing -void Load_QVs(HITS_DB *db) +int Load_QVs(HITS_DB *db) { FILE *quiva, *istub, *indx; + char *root; uint16 *table; - QVcoding *coding; HITS_QV *qvtrk; + QVcoding *coding, *nx; + int ncodes; if (db->tracks != NULL && strcmp(db->tracks->name,".@qvs") == 0) - return; + return (0); if (db->trimmed) - { fprintf(stderr,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: Cannot load QVs after trimming the DB\n",Prog_Name); + EXIT(1); } if (db->reads[db->nreads-1].coff < 0) - { fprintf(stderr,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: The requested QVs have not been added to the DB!\n",Prog_Name); + EXIT(1); } // Open .qvs, .idx, and .db files quiva = Fopen(Catenate(db->path,"","",".qvs"),"r"); if (quiva == NULL) - return; - if (db->part > 0) - { indx = Fopen(Catenate(db->path,"","",".idx"),"r"); - if (indx == NULL) - exit (1); - } - else - indx = NULL; + return (-1); - { char *x; + istub = NULL; + indx = NULL; + table = NULL; + coding = NULL; + qvtrk = NULL; - x = rindex(db->path,'/'); - *x = '\0'; - istub = Fopen(Catenate(db->path,"/",x+2,".db"),"r"); - if (istub == NULL) - exit (1); - *x = '/'; - } + root = rindex(db->path,'/') + 2; + istub = Fopen(Catenate(db->path,"/",root,".db"),"r"); + if (istub == NULL) + goto error; - { int first, last, nfiles, ncodes; + { int first, last, nfiles; char prolog[MAX_NAME], fname[MAX_NAME]; int i, j; - table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); - if (fscanf(istub,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } if (db->part > 0) - { int pfirst, plast; - int fbeg, fend; - int n, k; + { int pfirst, plast; + int fbeg, fend; + int n, k; + FILE *indx; // Determine first how many and which files span the block (fbeg to fend) @@ -746,7 +774,9 @@ void Load_QVs(HITS_DB *db) first = 0; for (fbeg = 0; fbeg < nfiles; fbeg++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } if (last > pfirst) break; first = last; @@ -755,31 +785,36 @@ void Load_QVs(HITS_DB *db) { if (last >= plast) break; if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } first = last; } + indx = Fopen(Catenate(db->path,"","",".idx"),"r"); ncodes = fend-fbeg; coding = (QVcoding *) Malloc(sizeof(QVcoding)*ncodes,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (indx == NULL || coding == NULL || table == NULL) + { ncodes = 0; + goto error; + } // Carefully get the first coding scheme (its offset is most likely in a HITS_RECORD // in .idx that is *not* in memory). Get all the other coding schemes normally and // assign the tables # for each read in the block in "tables". rewind(istub); - if (fscanf(istub,DB_NFILE,&nfiles) != 1) - SYSTEM_ERROR + fscanf(istub,DB_NFILE,&nfiles); first = 0; for (n = 0; n < fbeg; n++) - { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { fscanf(istub,DB_FDATA,&last,fname,prolog); first = last; } for (n = fbeg; n < fend; n++) - { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { fscanf(istub,DB_FDATA,&last,fname,prolog); i = n-fbeg; if (first < pfirst) @@ -787,13 +822,26 @@ void Load_QVs(HITS_DB *db) fseeko(indx,sizeof(HITS_DB) + sizeof(HITS_READ)*first,SEEK_SET); if (fread(&read,sizeof(HITS_READ),1,indx) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Index file (.idx) of %s is junk\n",Prog_Name,root); + ncodes = i; + goto error; + } fseeko(quiva,read.coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; } else { fseeko(quiva,db->reads[first-pfirst].coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; db->reads[first-pfirst].coff = ftello(quiva); } @@ -807,9 +855,10 @@ void Load_QVs(HITS_DB *db) table[j++] = (uint16) i; first = last; - } + } fclose(indx); + indx = NULL; } else @@ -818,14 +867,24 @@ void Load_QVs(HITS_DB *db) ncodes = nfiles; coding = (QVcoding *) Malloc(sizeof(QVcoding)*nfiles,"Allocating coding schemes"); + table = (uint16 *) Malloc(sizeof(uint16)*db->nreads,"Allocating QV table indices"); + if (coding == NULL || table == NULL) + goto error; first = 0; for (i = 0; i < nfiles; i++) { if (fscanf(istub,DB_FDATA,&last,fname,prolog) != 3) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Stub file (.db) of %s is junk\n",Prog_Name,root); + goto error; + } fseeko(quiva,db->reads[first].coff,SEEK_SET); - coding[i] = *Read_QVcoding(quiva); + nx = Read_QVcoding(quiva); + if (nx == NULL) + { ncodes = i; + goto error; + } + coding[i] = *nx; db->reads[first].coff = ftello(quiva); for (j = first; j < last; j++) @@ -839,9 +898,13 @@ void Load_QVs(HITS_DB *db) // track list qvtrk = (HITS_QV *) Malloc(sizeof(HITS_QV),"Allocating QV pseudo-track"); + if (qvtrk == NULL) + goto error; + qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); + if (qvtrk->name == NULL) + goto error; qvtrk->next = db->tracks; db->tracks = (HITS_TRACK *) qvtrk; - qvtrk->name = Strdup(".@qvs","Allocating QV pseudo-track name"); qvtrk->ncodes = ncodes; qvtrk->table = table; qvtrk->coding = coding; @@ -849,7 +912,25 @@ void Load_QVs(HITS_DB *db) } fclose(istub); - fclose(indx); + return (0); + +error: + if (qvtrk != NULL) + free(qvtrk); + if (table != NULL) + free(table); + if (coding != NULL) + { int i; + for (i = 0; i < ncodes; i++) + Free_QVcoding(coding+i); + free(coding); + } + if (indx != NULL) + fclose(indx); + if (istub != NULL) + fclose(istub); + fclose(quiva); + EXIT(1); } // Close the QV stream, free the QV pseudo track and all associated memory @@ -939,11 +1020,12 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) int treads, ureads; void *anno; void *data; + char *name; HITS_TRACK *record; if (track[0] == '.') - { fprintf(stderr,"Track names cannot begin with a .\n"); - exit (1); + { EPRINTF(EPLACE,"%s: Track name, '%s', cannot begin with a .\n",Prog_Name,track); + EXIT(NULL); } for (record = db->tracks; record != NULL; record = record->next) @@ -960,17 +1042,35 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) ispart = 0; } if (afile == NULL) - return (NULL); + { EPRINTF(EPLACE,"%s: Track '%s' does not exist\n",Prog_Name,track); + return (NULL); + } + + dfile = NULL; + anno = NULL; + data = NULL; + record = NULL; if (ispart) - dfile = fopen(Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"),"r"); + name = Catenate(db->path,Numbered_Suffix(".",db->part,"."),track,".data"); else - dfile = fopen(Catenate(db->path,".",track,".data"),"r"); + name = Catenate(db->path,".",track,".data"); + if (name == NULL) + goto error; + dfile = fopen(name,"r"); if (fread(&tracklen,sizeof(int),1,afile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } if (fread(&size,sizeof(int),1,afile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size <= 0) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } if (ispart) { ureads = ((int *) (db->reads))[-1]; @@ -983,16 +1083,16 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) if (db->trimmed) { if (tracklen != treads) - { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track); - exit (1); + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->tfirst,SEEK_CUR); } else { if (tracklen != ureads) - { fprintf(stderr,"%s: Track %s not same size as database !\n",Prog_Name,track); - exit (1); + { EPRINTF(EPLACE,"%s: Track '%s' not same size as database !\n",Prog_Name,track); + goto error; } if ( ! ispart && db->part > 0) fseeko(afile,size*db->ufirst,SEEK_CUR); @@ -1000,25 +1100,19 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) nreads = db->nreads; anno = (void *) Malloc(size*(nreads+1),"Allocating Track Anno Vector"); - - if (size > 0) - { if (dfile == NULL) - { if (fread(anno,size,nreads,afile) != (size_t) nreads) - SYSTEM_ERROR - } - else - { if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) - SYSTEM_ERROR - } - } - else - SYSTEM_ERROR + if (anno == NULL) + goto error; if (dfile != NULL) { int64 *anno8, off8, dlen; int *anno4, off4; int i; + if (fread(anno,size,nreads+1,afile) != (size_t) (nreads+1)) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + if (size == 4) { anno4 = (int *) anno; off4 = anno4[0]; @@ -1041,19 +1135,33 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) dlen = anno8[nreads]; data = (void *) Malloc(dlen,"Allocating Track Data Vector"); } + if (data == NULL) + goto error; if (dlen > 0) { if (fread(data,dlen,1,dfile) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Track '%s' data file is junk\n",Prog_Name,track); + goto error; + } } fclose(dfile); + dfile = NULL; } else - data = NULL; + { if (fread(anno,size,nreads,afile) != (size_t) nreads) + { EPRINTF(EPLACE,"%s: Track '%s' annotation file is junk\n",Prog_Name,track); + goto error; + } + data = NULL; + } fclose(afile); record = (HITS_TRACK *) Malloc(sizeof(HITS_TRACK),"Allocating Track Record"); + if (record == NULL) + goto error; record->name = Strdup(track,"Allocating Track Name"); + if (record->name == NULL) + goto error; record->data = data; record->anno = anno; record->size = size; @@ -1068,6 +1176,18 @@ HITS_TRACK *Load_Track(HITS_DB *db, char *track) } return (record); + +error: + if (record == NULL) + free(record); + if (data != NULL) + free(data); + if (anno != NULL) + free(anno); + if (dfile != NULL) + fclose(dfile); + fclose(afile); + EXIT (NULL); } void Close_Track(HITS_DB *db, char *track) @@ -1106,7 +1226,7 @@ char *New_Read_Buffer(HITS_DB *db) read = (char *) Malloc(db->maxlen+4,"Allocating New Read Buffer"); if (read == NULL) - exit (1); + EXIT(NULL); return (read+1); } @@ -1116,20 +1236,21 @@ char *New_Read_Buffer(HITS_DB *db) // // **NB**, the byte before read will be set to a delimiter character! -void Load_Read(HITS_DB *db, int i, char *read, int ascii) +int Load_Read(HITS_DB *db, int i, char *read, int ascii) { FILE *bases = (FILE *) db->bases; int64 off; int len, clen; HITS_READ *r = db->reads; + if (i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(1); + } if (bases == NULL) - { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); + { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - exit (1); - } - if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name); - exit (1); + EXIT(1); + db->bases = (void *) bases; } off = r[i].boff; @@ -1140,7 +1261,9 @@ void Load_Read(HITS_DB *db, int i, char *read, int ascii) clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(read,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(1); + } } Uncompress_Read(len,read); if (ascii == 1) @@ -1153,6 +1276,7 @@ void Load_Read(HITS_DB *db, int i, char *read, int ascii) } else read[-1] = 4; + return (0); } char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) @@ -1162,14 +1286,15 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) int bbeg, bend; HITS_READ *r = db->reads; + if (i >= db->nreads) + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_Read)\n",Prog_Name); + EXIT(NULL); + } if (bases == NULL) - { db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); + { bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - exit (1); - } - if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_Read)\n",Prog_Name); - exit (1); + EXIT(NULL); + db->bases = (void *) bases; } bbeg = beg/4; @@ -1183,7 +1308,9 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii) clen = bend-bbeg; if (clen > 0) { if (fread(read,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Failed read of .bps file (Load_Read)\n",Prog_Name); + EXIT(NULL); + } } Uncompress_Read(4*clen,read); read += beg%4; @@ -1219,7 +1346,7 @@ char **New_QV_Buffer(HITS_DB *db) qvs = (char *) Malloc(db->maxlen*5,"Allocating New QV Buffer"); entry = (char **) Malloc(sizeof(char *)*5,"Allocating New QV Buffer"); if (qvs == NULL || entry == NULL) - exit (1); + EXIT(NULL); for (i = 0; i < 5; i++) entry[i] = qvs + i*db->maxlen; return (entry); @@ -1228,22 +1355,22 @@ char **New_QV_Buffer(HITS_DB *db) // Load into entry the QV streams for the i'th read from db. The parameter ascii applies to // the DELTAG stream as described for Load_Read. -void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) +int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) { HITS_READ *reads; FILE *quiva; int rlen; if (db != Active_DB) { if (db->tracks == NULL || strcmp(db->tracks->name,".@qvs") != 0) - { fprintf(stderr,"%s: QV's are not loaded!\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: QV's are not loaded (Load_QVentry)\n",Prog_Name); + EXIT(1); } Active_QV = (HITS_QV *) db->tracks; Active_DB = db; } if (i >= db->nreads) - { fprintf(stderr,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); - exit (1); + { EPRINTF(EPLACE,"%s: Index out of bounds (Load_QVentry)\n",Prog_Name); + EXIT(1); } reads = db->reads; @@ -1251,7 +1378,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) rlen = reads[i].rlen; fseeko(quiva,reads[i].coff,SEEK_SET); - Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen); + if (Uncompress_Next_QVentry(quiva,entry,Active_QV->coding+Active_QV->table[i],rlen)) + EXIT(1); if (ascii != 1) { char *deltag = entry[1]; @@ -1270,6 +1398,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) deltag[j] = (char) (deltag[j]+u); } } + + return (0); } @@ -1285,8 +1415,8 @@ void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii) // non-zero then the reads are converted to ACGT ascii, otherwise the reads are left // as numeric strings over 0(A), 1(C), 2(G), and 3(T). -void Read_All_Sequences(HITS_DB *db, int ascii) -{ FILE *bases = (FILE *) db->bases; +int Read_All_Sequences(HITS_DB *db, int ascii) +{ FILE *bases; int nreads = db->nreads; HITS_READ *reads = db->reads; void (*translate)(char *s); @@ -1295,12 +1425,15 @@ void Read_All_Sequences(HITS_DB *db, int ascii) int64 o, off; int i, len, clen; + bases = Fopen(Catenate(db->path,"","",".bps"),"r"); if (bases == NULL) - db->bases = (void *) (bases = Fopen(Catenate(db->path,"","",".bps"),"r")); - else - rewind(bases); + EXIT(1); seq = (char *) Malloc(db->totlen+nreads+4,"Allocating All Sequence Reads"); + if (seq == NULL) + { fclose(bases); + EXIT(1); + } *seq++ = 4; @@ -1318,7 +1451,11 @@ void Read_All_Sequences(HITS_DB *db, int ascii) clen = COMPRESSED_LEN(len); if (clen > 0) { if (fread(seq+o,clen,1,bases) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"%s: Read of .bps file failed (Read_All_Sequences)\n",Prog_Name); + free(seq); + fclose(bases); + EXIT(1); + } } Uncompress_Read(len,seq+o); if (ascii) @@ -1332,9 +1469,11 @@ void Read_All_Sequences(HITS_DB *db, int ascii) db->bases = (void *) seq; db->loaded = 1; + + return (0); } -int List_DB_Files(char *path, void foreach(char *path, char *extension)) +int List_DB_Files(char *path, void actor(char *path, char *extension)) { int status, plen, rlen, dlen; char *root, *pwd, *name; int isdam; @@ -1351,13 +1490,15 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) rlen = strlen(root); if (root == NULL || pwd == NULL) - { status = 1; - goto exit; + { free(pwd); + free(root); + EXIT(1); } if ((dirp = opendir(pwd)) == NULL) - { status = 1; - goto exit; + { EPRINTF(EPLACE,"%s: Cannot open directory %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; + goto error; } isdam = 0; @@ -1380,15 +1521,16 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) } } if (dp == NULL) - { status = 1; + { EPRINTF(EPLACE,"%s: Cannot find %s (List_DB_Files)\n",Prog_Name,pwd); + status = -1; closedir(dirp); - goto exit; + goto error; } if (isdam) - foreach(Catenate(pwd,"/",root,".dam"),"dam"); + actor(Catenate(pwd,"/",root,".dam"),"dam"); else - foreach(Catenate(pwd,"/",root,".db"),"db"); + actor(Catenate(pwd,"/",root,".db"),"db"); rewinddir(dirp); // Report each auxiliary file while ((dp = readdir(dirp)) != NULL) @@ -1406,11 +1548,11 @@ int List_DB_Files(char *path, void foreach(char *path, char *extension)) continue; if (strncmp(name,root,rlen) != 0) continue; - foreach(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); + actor(Catenate(pwd,PATHSEP,name,""),name+(rlen+1)); } closedir(dirp); -exit: +error: free(pwd); free(root); return (status); diff --git a/DB.h b/DB.h index 471c84a..567a91a 100644 --- a/DB.h +++ b/DB.h @@ -57,6 +57,36 @@ #include "QV.h" +#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" + // Undefine if you don't want this + +// For interactive applications where it is inappropriate to simply exit with an error +// message to standard error, define the constant INTERACTIVE. If set, then error +// messages are put in the global variable Ebuffer and the caller of a DB routine +// can decide how to deal with the error. +// +// DB, QV, or alignment routines that can encounter errors function as before in +// non-INTERACTIVE mode by exiting after printing an error message to stderr. In +// INTERACTIVE mode the routines place a message at EPLACE and return an error +// value. For such routines that were previously void, they are now int, and +// return 1 if an error occured, 0 otherwise. + +#undef INTERACTIVE + +#ifdef INTERACTIVE + +#define EPRINTF sprintf +#define EPLACE Ebuffer +#define EXIT(x) return (x) + +#else // BATCH + +#define EPRINTF fprintf +#define EPLACE stderr +#define EXIT(x) exit (1) + +#endif + typedef unsigned char uint8; typedef unsigned short uint16; typedef unsigned int uint32; @@ -68,9 +98,6 @@ typedef signed long long int64; typedef float float32; typedef double float64; -#define HIDE_FILES // Auxiliary DB files start with a . so they are "hidden" - // Undefine if you don't want this - /******************************************************************************************* * @@ -80,8 +107,14 @@ typedef double float64; extern char *Prog_Name; // Name of program +#ifdef INTERACTIVE + +extern char Ebuffer[]; + +#endif + #define SYSTEM_ERROR \ - { fprintf(stderr,"%s: System error, read failed!\n",Prog_Name); \ + { EPRINTF(EPLACE,"%s: System error, read failed!\n",Prog_Name); \ exit (2); \ } @@ -291,7 +324,7 @@ typedef struct // a part # in it then just the part is opened. The index array is allocated (for all or // just the part) and read in. // Return status of routine: - // -1: The DB could not be opened for a reason reported by the routine to stderr + // -1: The DB could not be opened for a reason reported by the routine to EPLACE // 0: Open of DB proceeded without mishap // 1: Open of DAM proceeded without mishap @@ -309,10 +342,12 @@ void Trim_DB(HITS_DB *db); void Close_DB(HITS_DB *db); - // If QV pseudo track is not already in db's track list, then set it up and return a pointer - // to it. The database must not have been trimmed yet. + // If QV pseudo track is not already in db's track list, then load it and set it up. + // The database must not have been trimmed yet. -1 is returned if a .qvs file is not + // present, and 1 is returned if an error (reported to EPLACE) occured and INTERACTIVE + // is defined. Otherwise a 0 is returned. -void Load_QVs(HITS_DB *db); +int Load_QVs(HITS_DB *db); // Remove the QV pseudo track, all space associated with it, and close the .qvs file. @@ -329,7 +364,9 @@ int Check_Track(HITS_DB *db, char *track); // If track is not already in the db's track list, then allocate all the storage for it, // read it in from the appropriate file, add it to the track list, and return a pointer // to the newly created HITS_TRACK record. If the track does not exist or cannot be - // opened for some reason, then NULL is returned. + // opened for some reason, then NULL is returned if INTERACTIVE is defined. Otherwise + // the routine prints an error message to stderr and exits if an error occurs, and returns + // with NULL only if the track does not exist. HITS_TRACK *Load_Track(HITS_DB *db, char *track); @@ -340,27 +377,31 @@ void Close_Track(HITS_DB *db, char *track); // Allocate and return a buffer big enough for the largest read in 'db'. // **NB** free(x-1) if x is the value returned as *prefix* and suffix '\0'(4)-byte - // are needed by the alignment algorithms. + // are needed by the alignment algorithms. If cannot allocate memory then return NULL + // if INTERACTIVE is defined, or print error to stderr and exit otherwise. char *New_Read_Buffer(HITS_DB *db); // Load into 'read' the i'th read in 'db'. As a lower case ascii string if ascii is 1, an // upper case ascii string if ascii is 2, and a numeric string over 0(A), 1(C), 2(G), and 3(T) // otherwise. A '\0' (or 4) is prepended and appended to the string so it has a delimeter - // for traversals in either direction. + // for traversals in either direction. A non-zero value is returned if an error occured + // and INTERACTIVE is defined. -void Load_Read(HITS_DB *db, int i, char *read, int ascii); +int Load_Read(HITS_DB *db, int i, char *read, int ascii); // Load into 'read' the subread [beg,end] of the i'th read in 'db' and return a pointer to the // the start of the subinterval (not necessarily = to read !!! ). As a lower case ascii // string if ascii is 1, an upper case ascii string if ascii is 2, and a numeric string // over 0(A), 1(C), 2(G), and 3(T) otherwise. A '\0' (or 4) is prepended and appended to // the string holding the substring so it has a delimeter for traversals in either direction. + // A NULL pointer is returned if an error occured and INTERACTIVE is defined. char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); // Allocate a set of 5 vectors large enough to hold the longest QV stream that will occur - // in the database. + // in the database. If cannot allocate memory then return NULL if INTERACTIVE is defined, + // or print error to stderr and exit otherwise. #define DEL_QV 0 // The deletion QVs are x[DEL_QV] if x is the buffer returned by New_QV_Buffer #define DEL_TAG 1 // The deleted characters @@ -371,27 +412,31 @@ char *Load_Subread(HITS_DB *db, int i, int beg, int end, char *read, int ascii); char **New_QV_Buffer(HITS_DB *db); // Load into 'entry' the 5 QV vectors for i'th read in 'db'. The deletion tag or characters - // are converted to a numeric or upper/lower case ascii string as per ascii. + // are converted to a numeric or upper/lower case ascii string as per ascii. Return with + // a zero, except when an error occurs and INTERACTIVE is defined in which case return wtih 1. -void Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); +int Load_QVentry(HITS_DB *db, int i, char **entry, int ascii); // Allocate a block big enough for all the uncompressed sequences, read them into it, // reset the 'off' in each read record to be its in-memory offset, and set the // bases pointer to point at the block after closing the bases file. If ascii is // 1 then the reads are converted to lowercase ascii, if 2 then uppercase ascii, and // otherwise the reads are left as numeric strings over 0(A), 1(C), 2(G), and 3(T). + // Return with a zero, except when an error occurs and INTERACTIVE is defined in which + // case return wtih 1. -void Read_All_Sequences(HITS_DB *db, int ascii); +int Read_All_Sequences(HITS_DB *db, int ascii); // For the DB or DAM "path" = "prefix/root[.db|.dam]", find all the files for that DB, i.e. all - // those of the form "prefix/[.]root.part" and call foreach with the complete path to each file + // those of the form "prefix/[.]root.part" and call actor with the complete path to each file // pointed at by path, and the suffix of the path by extension. The . proceeds the root // name if the defined constant HIDE_FILES is set. Always the first call is with the // path "prefix/root.db" and extension "db". There will always be calls for // "prefix/[.]root.idx" and "prefix/[.]root.bps". All other calls are for *tracks* and // so this routine gives one a way to know all the tracks associated with a given DB. - // Return non-zero iff path could not be opened for any reason. + // -1 is returned if the path could not be found, and 1 is returned if an error (reported + // to EPLACE) occured and INTERACTIVE is defined. Otherwise a 0 is returned. -int List_DB_Files(char *path, void foreach(char *path, char *extension)); +int List_DB_Files(char *path, void actor(char *path, char *extension)); #endif // _HITS_DB diff --git a/QV.c b/QV.c index ffbbf5c..38f6db4 100644 --- a/QV.c +++ b/QV.c @@ -192,7 +192,7 @@ static HScheme *Huffman(uint64 *hist, HScheme *inscheme) scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) - exit (1); + return (NULL); hsize = 0; // Load heap value = 0; @@ -365,22 +365,30 @@ static HScheme *Read_Scheme(FILE *in) scheme = (HScheme *) Malloc(sizeof(HScheme),"Allocating Huffman scheme record"); if (scheme == NULL) - exit (1); + return (NULL); lens = scheme->codelens; bits = scheme->codebits; look = scheme->lookup; if (fread(&x,1,1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read scheme type byte (Read_Scheme)\n"); + free(scheme); + return (NULL); + } scheme->type = x; for (i = 0; i < 256; i++) { if (fread(&x,1,1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read length of %d'th code (Read_Scheme)\n",i); + return (NULL); + } lens[i] = x; if (x > 0) { if (fread(bits+i,sizeof(uint32),1,in) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read bit encoding of %d'th code (Read_Scheme)\n",i); + free(scheme); + return (NULL); + } } else bits[i] = 0; @@ -536,7 +544,7 @@ static void Encode_Run(HScheme *neme, HScheme *reme, FILE *out, uint8 *read, int // Read and decode from in, the next rlen symbols into read according to scheme -static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) +static int Decode(HScheme *scheme, FILE *in, char *read, int rlen) { int *look, *lens; int signal, ilen; uint64 icode; @@ -563,33 +571,37 @@ static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) lens = scheme->codelens; look = scheme->lookup; -#define GET \ - if (n > ilen) \ - { icode <<= ilen; \ - if (fread(ipart,sizeof(uint32),1,in) != 1) \ - SYSTEM_ERROR \ - ilen = n-ilen; \ - icode <<= ilen; \ - ilen = 32-ilen; \ - } \ - else \ - { icode <<= n; \ - ilen -= n; \ +#define GET \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen = 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ } -#define GETFLIP \ - if (n > ilen) \ - { icode <<= ilen; \ - if (fread(ipart,sizeof(uint32),1,in) != 1) \ - SYSTEM_ERROR \ - Flip_Long(ipart); \ - ilen = n-ilen; \ - icode <<= ilen; \ - ilen = 32-ilen; \ - } \ - else \ - { icode <<= n; \ - ilen -= n; \ +#define GETFLIP \ + if (n > ilen) \ + { icode <<= ilen; \ + if (fread(ipart,sizeof(uint32),1,in) != 1) \ + { EPRINTF(EPLACE,"Could not read more bits (Decode)\n"); \ + return (1); \ + } \ + Flip_Long(ipart); \ + ilen = n-ilen; \ + icode <<= ilen; \ + ilen = 32-ilen; \ + } \ + else \ + { icode <<= n; \ + ilen -= n; \ } n = 16; @@ -619,13 +631,15 @@ static void Decode(HScheme *scheme, FILE *in, char *read, int rlen) } read[j] = (char) c; } + + return (0); } // Read and decode from in, the next rlen symbols into read according to non-rchar scheme // neme, and the rchar runlength shceme reme -static void Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, - int rlen, int rchar) +static int Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, + int rlen, int rchar) { int *nlook, *nlens; int *rlook, *rlens; int nsignal, ilen; @@ -709,6 +723,8 @@ static void Decode_Run(HScheme *neme, HScheme *reme, FILE *in, char *read, read[j] = (char) c; } } + + return (0); } @@ -752,7 +768,7 @@ static void Histogram_Runs(uint64 *run, uint8 *stream, int rlen, int runChar) ********************************************************************************************/ static char *Read = NULL; // Referred by: QVentry, Read_Lines, QVcoding_Scan, -static int Rmax = -1; // Compress_Next_QVentry, Uncompress_Next_QVentry +static int Rmax = -1; // Compress_Next_QVentry static int Nline; // Referred by: QVcoding_Scan @@ -761,17 +777,21 @@ char *QVentry() // If nlines == 1 trying to read a single header, nlines = 5 trying to read 5 QV/fasta lines // for a sequence. Place line j at Read+j*Rmax and the length of every line is returned -// unless eof occurs in which case return -1. +// unless eof occurs in which case return -1. If any error occurs return -2. int Read_Lines(FILE *input, int nlines) { int i, rlen; + int tmax; + char *tread; char *other; if (Read == NULL) - { Rmax = MIN_BUFFER; - Read = (char *) Malloc(5*Rmax,"Allocating QV entry read buffer"); - if (Read == NULL) - exit (1); + { tmax = MIN_BUFFER; + tread = (char *) Malloc(5*tmax,"Allocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; } Nline += 1; @@ -780,13 +800,15 @@ int Read_Lines(FILE *input, int nlines) rlen = strlen(Read); while (Read[rlen-1] != '\n') - { Rmax = ((int) 1.4*Rmax) + MIN_BUFFER; - Read = (char *) Realloc(Read,5*Rmax,"Reallocating QV entry read buffer"); - if (Read == NULL) - exit (1); + { tmax = ((int) 1.4*Rmax) + MIN_BUFFER; + tread = (char *) Realloc(Read,5*tmax,"Reallocating QV entry read buffer"); + if (tread == NULL) + EXIT(-2); + Rmax = tmax; + Read = tread; if (fgets(Read+rlen,Rmax-rlen,input) == NULL) - { fprintf(stderr,"Line %d: Last line does not end with a newline !\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Last line does not end with a newline !\n",Nline); + EXIT(-2); } rlen += strlen(Read+rlen); } @@ -795,12 +817,12 @@ int Read_Lines(FILE *input, int nlines) { other += Rmax; Nline += 1; if (fgets(other,Rmax,input) == NULL) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(-2); } if (rlen != (int) strlen(other)) - { fprintf(stderr,"Line %d: Lines for an entry are not the same length\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Lines for an entry are not the same length\n",Nline); + EXIT(-2); } } return (rlen-1); @@ -871,7 +893,7 @@ static int delChar, subChar; // Referred by: QVcoding_Scan, Create_QVcoding -void QVcoding_Scan(FILE *input) +int QVcoding_Scan(FILE *input) { char *slash; int rlen; @@ -900,29 +922,32 @@ void QVcoding_Scan(FILE *input) { int well, beg, end, qv; rlen = Read_Lines(input,1); + if (rlen == -2) + EXIT(1); if (rlen < 0) break; if (rlen == 0 || Read[0] != '@') - { fprintf(stderr,"Line %d: Header in quiv file is missing\n",Nline); - exit (1); + { EPRINTF(EPLACE,"Line %d: Header in quiv file is missing\n",Nline); + EXIT(1); } slash = index(Read+1,'/'); if (slash == NULL) - { fprintf(stderr,"%s: Line %d: Header line incorrectly formatted ?\n", + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); - exit (1); + EXIT(1); } if (sscanf(slash+1,"%d/%d_%d RQ=0.%d\n",&well,&beg,&end,&qv) != 4) - { fprintf(stderr,"%s: Line %d: Header line incorrectly formatted ?\n", + { EPRINTF(EPLACE,"%s: Line %d: Header line incorrectly formatted ?\n", Prog_Name,Nline); - exit (1); + EXIT(1); } rlen = Read_Lines(input,5); if (rlen < 0) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT(1); } Histogram_Seqs(delHist,(uint8 *) (Read),rlen); @@ -956,6 +981,8 @@ void QVcoding_Scan(FILE *input) if (subChar >= 0) Histogram_Runs( subRun,(uint8 *) (Read+4*Rmax),rlen,subChar); } + + return (0); } // Using the statistics in the global stat tables, create the Huffman schemes and write @@ -964,8 +991,16 @@ void QVcoding_Scan(FILE *input) QVcoding *Create_QVcoding(int lossy) { static QVcoding coding; - static HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; - static HScheme *dRunScheme, *sRunScheme; + + HScheme *delScheme, *insScheme, *mrgScheme, *subScheme; + HScheme *dRunScheme, *sRunScheme; + + delScheme = NULL; + dRunScheme = NULL; + insScheme = NULL; + mrgScheme = NULL; + subScheme = NULL; + sRunScheme = NULL; // Check whether using a subtitution run char is a win @@ -996,6 +1031,8 @@ QVcoding *Create_QVcoding(int lossy) #define SCHEME_MACRO(meme,hist,label,bits) \ scheme = Huffman( (hist), NULL); \ + if (scheme == NULL) \ + goto error; \ if (scheme->type) \ { (meme) = Huffman( (hist), scheme); \ free(scheme); \ @@ -1077,6 +1114,21 @@ QVcoding *Create_QVcoding(int lossy) coding.flip = 0; return (&coding); + +error: + if (delScheme != NULL) + free(delScheme); + if (dRunScheme != NULL) + free(dRunScheme); + if (insScheme != NULL) + free(insScheme); + if (mrgScheme != NULL) + free(mrgScheme); + if (subScheme != NULL) + free(subScheme); + if (sRunScheme != NULL) + free(sRunScheme); + EXIT(NULL); } // Write the encoding scheme 'coding' to 'output' @@ -1131,11 +1183,15 @@ QVcoding *Read_QVcoding(FILE *input) int len; if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read flip byte (Read_QVcoding)\n"); + EXIT(NULL); + } coding.flip = (half != 0x33cc); if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletion char (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Short(&half); coding.delChar = half; @@ -1143,7 +1199,9 @@ QVcoding *Read_QVcoding(FILE *input) coding.delChar = -1; if (fread(&half,sizeof(uint16),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read substitution char (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Short(&half); coding.subChar = half; @@ -1153,15 +1211,19 @@ QVcoding *Read_QVcoding(FILE *input) // Read the short name common to all headers if (fread(&len,sizeof(int),1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read header name length (Read_QVcoding)\n"); + EXIT(NULL); + } if (coding.flip) Flip_Long(&len); coding.prefix = (char *) Malloc(len+1,"Allocating header prefix"); if (coding.prefix == NULL) - exit (1); + EXIT(NULL); if (len > 0) { if (fread(coding.prefix,len,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read header name (Read_QVcoding)\n"); + EXIT(NULL); + } } coding.prefix[len] = '\0'; } @@ -1172,16 +1234,52 @@ QVcoding *Read_QVcoding(FILE *input) // Read the Huffman schemes used to compress the data - coding.delScheme = Read_Scheme(input); + coding.delScheme = NULL; + coding.dRunScheme = NULL; + coding.insScheme = NULL; + coding.mrgScheme = NULL; + coding.subScheme = NULL; + coding.sRunScheme = NULL; + + coding.delScheme = Read_Scheme(input); + if (coding.delScheme == NULL) + goto error; if (coding.delChar >= 0) - coding.dRunScheme = Read_Scheme(input); - coding.insScheme = Read_Scheme(input); - coding.mrgScheme = Read_Scheme(input); - coding.subScheme = Read_Scheme(input); + { coding.dRunScheme = Read_Scheme(input); + if (coding.dRunScheme == NULL) + goto error; + } + coding.insScheme = Read_Scheme(input); + if (coding.insScheme == NULL) + goto error; + coding.mrgScheme = Read_Scheme(input); + if (coding.mrgScheme == NULL) + goto error; + coding.subScheme = Read_Scheme(input); + if (coding.subScheme == NULL) + goto error; if (coding.subChar >= 0) - coding.sRunScheme = Read_Scheme(input); + { coding.sRunScheme = Read_Scheme(input); + if (coding.sRunScheme == NULL) + goto error; + } return (&coding); + +error: + if (coding.delScheme != NULL) + free(coding.delScheme); + if (coding.dRunScheme != NULL) + free(coding.dRunScheme); + if (coding.insScheme != NULL) + free(coding.insScheme); + if (coding.mrgScheme != NULL) + free(coding.mrgScheme); + if (coding.subScheme != NULL) + free(coding.subScheme); + if (coding.sRunScheme != NULL) + free(coding.sRunScheme); + EXIT(NULL); } // Free all the auxilliary storage associated with the encoding argument @@ -1205,15 +1303,16 @@ void Free_QVcoding(QVcoding *coding) * ********************************************************************************************/ -void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy) { int rlen, clen; // Get all 5 streams, compress each with its scheme, and output rlen = Read_Lines(input,5); if (rlen < 0) - { fprintf(stderr,"Line %d: incomplete last entry of .quiv file\n",Nline); - exit (1); + { if (rlen == -1) + EPRINTF(EPLACE,"Line %d: incomplete last entry of .quiv file\n",Nline); + EXIT (1); } if (coding->delChar < 0) @@ -1247,45 +1346,61 @@ void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int loss else Encode_Run(coding->subScheme, coding->sRunScheme, output, (uint8 *) (Read+4*Rmax), rlen, coding->subChar); + + return (0); } -void Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen) { int clen, tlen; // Decode each stream and write to output if (coding->delChar < 0) - { Decode(coding->delScheme, input, entry[0], rlen); + { if (Decode(coding->delScheme, input, entry[0], rlen)) + EXIT(1); clen = rlen; tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); } else - { Decode_Run(coding->delScheme, coding->dRunScheme, input, - entry[0], rlen, coding->delChar); + { if (Decode_Run(coding->delScheme, coding->dRunScheme, input, + entry[0], rlen, coding->delChar)) + EXIT(1); clen = Packed_Length(entry[0],rlen,coding->delChar); tlen = COMPRESSED_LEN(clen); if (tlen > 0) { if (fread(entry[1],tlen,1,input) != 1) - SYSTEM_ERROR + { EPRINTF(EPLACE,"Could not read deletions entry (Uncompress_Next_QVentry\n"); + EXIT(1); + } } Uncompress_Read(clen,entry[1]); Lower_Read(entry[1]); Unpack_Tag(entry[1],clen,entry[0],rlen,coding->delChar); } - Decode(coding->insScheme, input, entry[2], rlen); + if (Decode(coding->insScheme, input, entry[2], rlen)) + EXIT(1); - Decode(coding->mrgScheme, input, entry[3], rlen); + if (Decode(coding->mrgScheme, input, entry[3], rlen)) + EXIT(1); if (coding->subChar < 0) - Decode(coding->subScheme, input, entry[4], rlen); + { if (Decode(coding->subScheme, input, entry[4], rlen)) + EXIT(1); + } else - Decode_Run(coding->subScheme, coding->sRunScheme, input, - entry[4], rlen, coding->subChar); + { if (Decode_Run(coding->subScheme, coding->sRunScheme, input, + entry[4], rlen, coding->subChar)) + EXIT(1); + } + + return (0); } diff --git a/QV.h b/QV.h index 08c1801..35fbadc 100644 --- a/QV.h +++ b/QV.h @@ -52,6 +52,15 @@ #define _QV_COMPRESSOR + // The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + // batch version of the routines in this library are compiled. In batch mode, routines + // print an error message and exit. In interactive mode, the routines place the error + // message in EPLACE (also defined in DB.h) and return an error value, typically NULL + // if the routine returns a pointer, and an unusual integer value if the routine returns + // an integer. + // Below when an error return is described, one should understand that this value is returned + // only if the routine was compiled in INTERACTIVE mode. + // A PacBio compression scheme typedef struct @@ -68,23 +77,27 @@ typedef struct } QVcoding; // Read the next nlines of input, and QVentry returns a pointer to the first line if needed. + // If end-of-input is encountered before any further input, -1 is returned. If there is + // an error than -2 is returned. Otherwise the length of the line(s) read is returned. int Read_Lines(FILE *input, int nlines); char *QVentry(); - // Read the .quiva file on input and record frequency statistics. + // Read the .quiva file on input and record frequency statistics. If there is an error + // then 1 is returned, otherwise 0. -void QVcoding_Scan(FILE *input); +int QVcoding_Scan(FILE *input); // Given QVcoding_Scan has been called at least once, create an encoding scheme based on // the accumulated statistics and return a pointer to it. The returned encoding object - // is *statically* allocated within the routine. If lossy is set then use a lossy scaling - // for the insertion and merge streams. + // is *statically allocated within the routine. If lossy is set then use a lossy scaling + // for the insertion and merge streams. If there is an error, then NULL is returned. QVcoding *Create_QVcoding(int lossy); // Read/write a coding scheme to input/output. The encoding object returned by the reader - // is *statically* allocated within the routine. + // is *statically* allocated within the routine. If an error occurs while reading then + // NULL is returned. QVcoding *Read_QVcoding(FILE *input); void Write_QVcoding(FILE *output, QVcoding *coding); @@ -95,16 +108,18 @@ void Free_QVcoding(QVcoding *coding); // Assuming the file pointer is positioned just beyond an entry header line, read the // next set of 5 QV lines, compress them according to 'coding', and output. If lossy - // is set then the scheme is a lossy one. + // is set then the scheme is a lossy one. A non-zero value is return only if an + // error occured. -void Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); +int Compress_Next_QVentry(FILE *input, FILE *output, QVcoding *coding, int lossy); - // Assuming the input is positioned just beyond the compressed encoding of an entry header, + // Assuming the input is position just beyond the compressed encoding of an entry header, // read the set of compressed encodings for the ensuing 5 QV vectors, decompress them, // and place their decompressed values into entry which is a 5 element array of character // pointers. The parameter rlen computed from the preceeding header line, critically - // provides the length of each of the 5 vectors. + // provides the length of each of the 5 vectors. A non-zero value is return only if an + // error occured. -void Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); +int Uncompress_Next_QVentry(FILE *input, char **entry, QVcoding *coding, int rlen); #endif // _QV_COMPRESSOR diff --git a/align.c b/align.c index 839609e..4666fe0 100644 --- a/align.c +++ b/align.c @@ -73,7 +73,6 @@ #undef DEBUG_ALIGN // Show division points of Compute_Trace #undef DEBUG_SCRIPT // Show trace additions for Compute_Trace #undef DEBUG_AWAVE // Show F/R waves of Compute_Trace -#define SMALL_BIT 100 #undef SHOW_TRACE // Show full trace for Print_Alignment @@ -102,7 +101,7 @@ Work_Data *New_Work_Data() work = (_Work_Data *) Malloc(sizeof(_Work_Data),"Allocating work data block"); if (work == NULL) - exit (1); + EXIT(NULL); work->vecmax = 0; work->vector = NULL; work->pntmax = 0; @@ -114,25 +113,43 @@ Work_Data *New_Work_Data() return ((Work_Data *) work); } -static void enlarge_vector(_Work_Data *work, int newmax) -{ work->vecmax = ((int) (newmax*1.2)) + 10000; - work->vector = Realloc(work->vector,work->vecmax,"Enlarging DP vector"); - if (work->vector == NULL) - exit (1); +static int enlarge_vector(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->vector,max,"Enlarging DP vector"); + if (vec == NULL) + EXIT(1); + work->vecmax = max; + work->vector = vec; + return (0); } -static void enlarge_points(_Work_Data *work, int newmax) -{ work->pntmax = ((int) (newmax*1.2)) + 10000; - work->points = Realloc(work->points,work->pntmax,"Enlarging point vector"); - if (work->points == NULL) - exit (1); +static int enlarge_points(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->points,max,"Enlarging point vector"); + if (vec == NULL) + EXIT(1); + work->pntmax = max; + work->points = vec; + return (0); } -static void enlarge_trace(_Work_Data *work, int newmax) -{ work->tramax = ((int) (newmax*1.2)) + 10000; - work->trace = Realloc(work->trace,work->tramax,"Enlarging trace vector"); - if (work->trace == NULL) - exit (1); +static int enlarge_trace(_Work_Data *work, int newmax) +{ void *vec; + int max; + + max = ((int) (newmax*1.2)) + 10000; + vec = Realloc(work->trace,max,"Enlarging trace vector"); + if (vec == NULL) + EXIT(1); + work->tramax = max; + work->trace = vec; + return (0); } void Free_Work_Data(Work_Data *ework) @@ -176,16 +193,6 @@ void Free_Work_Data(Work_Data *ework) static double Bias_Factor[10] = { .690, .690, .690, .690, .780, .850, .900, .933, .966, 1.000 }; - // Micro-Sat Band Parameters - -#define MICRO_SAT 20 - -static int Sat_Width[MICRO_SAT+1] = - { -1, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; - -#define SAT_LOW .75 -#define SAT_HGH 1.25 - // Adjustable paramters typedef struct @@ -234,7 +241,7 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) spec = (_Align_Spec *) Malloc(sizeof(_Align_Spec),"Allocating alignment specification"); if (spec == NULL) - exit (1); + EXIT(NULL); spec->ave_corr = ave_corr; spec->trace_space = trace_space; @@ -248,8 +255,9 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) match = 1.-match; bias = (int) ((match+.025)*20.-1.); if (match < .2) - { fprintf(stderr,"Warning: Base bias worse than 80/20%% !\n"); - bias = 3; + { EPRINTF(EPLACE,"Base bias worse than 80/20%% ! (New_Align_Spec)\n"); + free(spec); + EXIT(NULL); } spec->ave_path = (int) (PATH_LEN * (1. - Bias_Factor[bias] * (1. - ave_corr))); @@ -258,7 +266,9 @@ Align_Spec *New_Align_Spec(double ave_corr, int trace_space, float *freq) parms.score = (int16 *) Malloc(sizeof(int16)*(TRIM_MASK+1)*2,"Allocating trim table"); if (parms.score == NULL) - exit (1); + { free(spec); + EXIT(NULL); + } parms.table = parms.score + (TRIM_MASK+1); set_table(0,0,0,0,&parms); @@ -346,20 +356,25 @@ typedef struct int mark; } Pebble; -static int forward_wave(_Work_Data *work, _Align_Spec *spec, - Alignment *align, Path *bpath, - int mind, int maxd, int mida) +static int VectorEl = 6*sizeof(int) + sizeof(BVEC); + +static int forward_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int *mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq; char *bseq = align->bseq; Path *apath = align->path; int hgh, low, dif; - int minp, maxp; + int vlen, vmin, vmax; int *V, *M; + int *_V, *_M; BVEC *T; + BVEC *_T; int *HA, *HB; + int *_HA, *_HB; int *NA, *NB; + int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; @@ -376,17 +391,33 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, int more, morem, lasta; int aclip, bclip; - { int alen = align->alen + 1; - int blen = align->blen + 1; - int tlen = alen + blen + 1; + hgh = maxd; + low = *mind; + dif = 0; - V = ((int *) work->vector) + blen; - M = V + tlen; - HA = M + tlen; - HB = HA + tlen; - NA = HB + tlen; - NB = NA + tlen; - T = ((BVEC *) (NB + alen)) + blen; + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; @@ -400,39 +431,6 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, /* Compute 0-wave starting from mid-line */ - hgh = maxd; - low = mind; - if (aseq == bseq) - { if (low < 0) - { int big = -low; - int sml = -hgh; - - if (big <= MICRO_SAT) - minp = low - Sat_Width[big]; - else - minp = -SAT_HGH*big; - if (sml <= MICRO_SAT) - maxp = hgh + Sat_Width[sml]; - else - maxp = -SAT_LOW*sml; - } - else - { if (low <= MICRO_SAT) - minp = low - Sat_Width[low]; - else - minp = SAT_LOW*low; - if (hgh <= MICRO_SAT) - maxp = hgh + Sat_Width[hgh]; - else - maxp = SAT_HGH*hgh; - } - } - else - { minp = -INT32_MAX; - maxp = INT32_MAX; - } - dif = 0; - more = 1; aclip = INT32_MAX; bclip = -INT32_MAX; @@ -460,7 +458,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } na = ((y+k)/TRACE_SPACE)*TRACE_SPACE; @@ -512,7 +512,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); @@ -530,7 +532,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); @@ -604,6 +608,82 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, int am, ac, ap; char *a; + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move; + int64 vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + if (low > minp) { low -= 1; NA[low] = NA[low+1]; @@ -702,7 +782,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); @@ -724,7 +806,9 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); @@ -938,27 +1022,29 @@ static int forward_wave(_Work_Data *work, _Align_Spec *spec, bpath->tlen = btlen; } - work->cells = (void *) cells; - work->celmax = cmax; - - return (low); + *mind = low; + return (0); } /*** Reverse Wave ***/ -static void reverse_wave(_Work_Data *work, _Align_Spec *spec, - Alignment *align, Path *bpath, int mind, int maxd, int mida) +static int reverse_wave(_Work_Data *work, _Align_Spec *spec, Alignment *align, Path *bpath, + int mind, int maxd, int mida, int minp, int maxp) { char *aseq = align->aseq - 1; char *bseq = align->bseq - 1; Path *apath = align->path; int hgh, low, dif; - int minp, maxp; + int vlen, vmin, vmax; int *V, *M; + int *_V, *_M; BVEC *T; + BVEC *_T; int *HA, *HB; + int *_HA, *_HB; int *NA, *NB; + int *_NA, *_NB; Pebble *cells; int avail, cmax, boff; @@ -975,17 +1061,33 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, int more, morem, lasta; int aclip, bclip; - { int alen = align->alen + 1; - int blen = align->blen + 1; - int tlen = alen + blen + 1; + hgh = maxd; + low = mind; + dif = 0; - V = ((int *) work->vector) + blen; - M = V + tlen; - HA = M + tlen; - HB = HA + tlen; - NA = HB + tlen; - NB = NA + tlen; - T = ((BVEC *) (NB + alen)) + blen; + { int span, wing; + + span = (hgh-low)+1; + vlen = work->vecmax/VectorEl; + wing = (vlen - span)/2; + vmin = low - wing; + vmax = hgh + wing; + + _V = ((int *) work->vector); + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; cells = (Pebble *) (work->cells); cmax = work->celmax; @@ -997,39 +1099,6 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, boff = 0; } - hgh = maxd; - low = mind; - if (aseq == bseq) - { if (low < 0) - { int big = -low; - int sml = -hgh; - - if (big <= MICRO_SAT) - minp = low - Sat_Width[big]; - else - minp = -SAT_HGH*big; - if (sml <= MICRO_SAT) - maxp = hgh + Sat_Width[sml]; - else - maxp = -SAT_LOW*sml; - } - else - { if (low <= MICRO_SAT) - minp = low - Sat_Width[low]; - else - minp = SAT_LOW*low; - if (hgh <= MICRO_SAT) - maxp = hgh + Sat_Width[hgh]; - else - maxp = SAT_HGH*hgh; - } - } - else - { minp = -INT32_MAX; - maxp = INT32_MAX; - } - dif = 0; - more = 1; aclip = -INT32_MAX; bclip = INT32_MAX; @@ -1057,7 +1126,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } na = ((y+k+TRACE_SPACE-1)/TRACE_SPACE-1)*TRACE_SPACE; @@ -1107,7 +1178,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,0,%d\n",avail,ha,k,na); fflush(stdout); @@ -1125,7 +1198,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, { cmax = ((int) (avail*1.2)) + 10000; cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble),"Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,0,%d\n",avail,hb,k,nb); fflush(stdout); @@ -1197,6 +1272,81 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, int am, ac, ap; char *a; + if (low <= vmin || hgh >= vmax) + { int span, wing; + int64 move, vd, md, had, hbd, nad, nbd, td; + + span = (hgh-low)+1; + if (.8*vlen < span) + { if (enlarge_vector(work,vlen*VectorEl)) + EXIT(1); + + move = ((void *) _V) - work->vector; + vlen = work->vecmax/VectorEl; + + _V = (int *) work->vector; + _M = _V + vlen; + _HA = _M + vlen; + _HB = _HA + vlen; + _NA = _HB + vlen; + _NB = _NA + vlen; + _T = ((BVEC *) (_NB + vlen)); + } + else + move = 0; + + wing = (vlen - span)/2; + + vd = ((void *) ( _V+wing)) - (((void *) ( V+low)) - move); + md = ((void *) ( _M+wing)) - (((void *) ( M+low)) - move); + had = ((void *) (_HA+wing)) - (((void *) (HA+low)) - move); + hbd = ((void *) (_HB+wing)) - (((void *) (HB+low)) - move); + nad = ((void *) (_NA+wing)) - (((void *) (NA+low)) - move); + nbd = ((void *) (_NB+wing)) - (((void *) (NB+low)) - move); + td = ((void *) ( _T+wing)) - (((void *) ( T+low)) - move); + + if (vd < 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + if (md < 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (had < 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (hbd < 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (nad < 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (nbd < 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (td < 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + + if (td > 0) + memmove( _T+wing, ((void *) ( T+low)) - move, span*sizeof(BVEC)); + if (nbd > 0) + memmove(_NB+wing, ((void *) (NB+low)) - move, span*sizeof(int)); + if (nad > 0) + memmove(_NA+wing, ((void *) (NA+low)) - move, span*sizeof(int)); + if (hbd > 0) + memmove(_HB+wing, ((void *) (HB+low)) - move, span*sizeof(int)); + if (had > 0) + memmove(_HA+wing, ((void *) (HA+low)) - move, span*sizeof(int)); + if (md > 0) + memmove( _M+wing, ((void *) ( M+low)) - move, span*sizeof(int)); + if (vd > 0) + memmove( _V+wing, ((void *) ( V+low)) - move, span*sizeof(int)); + + vmin = low-wing; + vmax = hgh+wing; + + V = _V-vmin; + M = _M-vmin; + HA = _HA-vmin; + HB = _HB-vmin; + NA = _NA-vmin; + NB = _NB-vmin; + T = _T-vmin; + } + if (low > minp) { low -= 1; NA[low] = NA[low+1]; @@ -1295,7 +1445,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" A %d: %d,%d,%d,%d\n",avail,ha,k,dif,NA[k]); fflush(stdout); @@ -1316,7 +1468,9 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, cells = (Pebble *) Realloc(cells,cmax*sizeof(Pebble), "Reallocating trace cells"); if (cells == NULL) - exit (1); + EXIT(1); + work->celmax = cmax; + work->cells = (void *) cells; } #ifdef SHOW_TPS printf(" B %d: %d,%d,%d,%d\n",avail,hb,k,dif,NB[k]); fflush(stdout); @@ -1586,8 +1740,7 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, bpath->trace = btrace + btlen; } - work->cells = (void *) cells; - work->celmax = cmax; + return (0); } @@ -1596,21 +1749,27 @@ static void reverse_wave(_Work_Data *work, _Align_Spec *spec, */ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, - int xlow, int xhgh, int ycnt) + int low, int hgh, int anti, int lbord, int hbord) { _Work_Data *work = ( _Work_Data *) ework; _Align_Spec *spec = (_Align_Spec *) espec; Path *apath, *bpath; - int alen, blen; + int minp, maxp; + int selfie; - alen = align->alen; - blen = align->blen; + { int alen, blen; + int maxtp, wsize; - { int maxtp, wsize; + alen = align->alen; + blen = align->blen; - wsize = (6*sizeof(int) + sizeof(BVEC))*(alen+blen+3); + if (hgh-low >= 7500) + wsize = VectorEl*(hgh-low+1); + else + wsize = VectorEl*10000; if (wsize >= work->vecmax) - enlarge_vector(work,wsize); + if (enlarge_vector(work,wsize)) + EXIT(NULL); if (alen < blen) maxtp = 2*(blen/spec->trace_space+2); @@ -1618,7 +1777,8 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, maxtp = 2*(alen/spec->trace_space+2); wsize = 4*maxtp*sizeof(uint16) + sizeof(Path); if (wsize > work->pntmax) - enlarge_points(work,wsize); + if (enlarge_points(work,wsize)) + EXIT(NULL); apath = align->path; bpath = (Path *) work->points; @@ -1631,30 +1791,41 @@ Path *Local_Alignment(Alignment *align, Work_Data *ework, Align_Spec *espec, printf("\n"); #endif - { int l, h, a; + selfie = (align->aseq == align->bseq); + + if (lbord < 0) + { if (selfie && low >= 0) + minp = 1; + else + minp = -INT32_MAX; + } + else + minp = low-lbord; + if (hbord < 0) + { if (selfie && hgh <= 0) + maxp = -1; + else + maxp = INT32_MAX; + } + else + maxp = hgh+hbord; - l = xlow-ycnt; - h = xhgh-ycnt; - a = (xlow+xhgh)/2+ycnt; - if (a < h) - a = h; - if (a < -l) - a = -l; - if (a > 2*alen - h) - a = 2*alen-h; - if (a > 2*blen + l) - a = 2*blen+l; + if (forward_wave(work,spec,align,bpath,&low,hgh,anti,minp,maxp)) + EXIT(NULL); - l = forward_wave(work,spec,align,bpath,l,h,a); #ifdef DEBUG_PASSES - printf("F1 (%d-%d,%d) => (%d,%d) %d\n",xlow,xhgh,ycnt,apath->aepos,apath->bepos,apath->diffs); + printf("F1 (%d,%d) ~ %d => (%d,%d) %d\n", + (2*anti+(low+hgh))/4,(anti-(low+hgh))/4,hgh-low, + apath->aepos,apath->bepos,apath->diffs); #endif - reverse_wave(work,spec,align,bpath,l,l,a); + if (reverse_wave(work,spec,align,bpath,low,low,anti,minp,maxp)) + EXIT(NULL); + #ifdef DEBUG_PASSES - printf("R1 (%d,%d) => (%d,%d) %d\n",l,ycnt,apath->abpos,apath->bbpos,apath->diffs); + printf("R1 (%d,%d) => (%d,%d) %d\n", + (anti+low)/2,(anti-low)/2,apath->abpos,apath->bbpos,apath->diffs); #endif - } if (COMP(align->flags)) { uint16 *trace = (uint16 *) bpath->trace; @@ -1802,7 +1973,7 @@ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) if (((ovl->path.aepos-1)/tspace - ovl->path.abpos/tspace)*2 != ovl->path.tlen-2) { if (verbose) - fprintf(stderr," %s: Wrong number of trace points\n",fname); + EPRINTF(EPLACE," %s: Wrong number of trace points\n",fname); return (1); } p = ovl->path.bbpos; @@ -1818,7 +1989,7 @@ int Check_Trace_Points(Overlap *ovl, int tspace, int verbose, char *fname) } if (p != ovl->path.bepos) { if (verbose) - fprintf(stderr," %s: Trace point sum != aligned interval\n",fname); + EPRINTF(EPLACE," %s: Trace point sum != aligned interval\n",fname); return (1); } return (0); @@ -1921,8 +2092,8 @@ void Complement_Seq(char *aseq, int len) static char ToL[8] = { 'a', 'c', 'g', 't', '.', '[', ']', '-' }; static char ToU[8] = { 'A', 'C', 'G', 'T', '.', '[', ']', '-' }; -void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, - int indent, int width, int border, int upper, int coord) +int Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, + int indent, int width, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; @@ -1937,7 +2108,7 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, int match, diff; char *N2A; - if (trace == NULL) return; + if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); @@ -1947,7 +2118,8 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, o = sizeof(char)*3*(width+1); if (o > work->vecmax) - enlarge_vector(work,o); + if (enlarge_vector(work,o)) + EXIT(1); if (upper) N2A = ToU; @@ -2157,10 +2329,11 @@ void Print_Alignment(FILE *file, Alignment *align, Work_Data *ework, fprintf(file,"\n"); fflush(file); + return (0); } -void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, - int indent, int block, int border, int upper, int coord) +int Print_Reference(FILE *file, Alignment *align, Work_Data *ework, + int indent, int block, int border, int upper, int coord) { _Work_Data *work = (_Work_Data *) ework; int *trace = align->path->trace; int tlen = align->path->tlen; @@ -2176,7 +2349,7 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, char *N2A; int vmax; - if (trace == NULL) return; + if (trace == NULL) return (0); #ifdef SHOW_TRACE fprintf(file,"\nTrace:\n"); @@ -2187,7 +2360,8 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, vmax = work->vecmax/3; o = sizeof(char)*6*(block+1); if (o > vmax) - { enlarge_vector(work,3*o); + { if (enlarge_vector(work,3*o)) + EXIT(1); vmax = work->vecmax/3; } @@ -2245,7 +2419,8 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, Bbuf[o] = N2A[v]; \ o += 1; \ if (o >= vmax) \ - { enlarge_vector(work,3*o); \ + { if (enlarge_vector(work,3*o)) \ + EXIT(1); \ vmax = work->vecmax/3; \ memmove(work->vector+2*vmax,Dbuf,o); \ memmove(work->vector+vmax,Bbuf,o); \ @@ -2408,6 +2583,7 @@ void Print_Reference(FILE *file, Alignment *align, Work_Data *ework, fprintf(file,"\n"); fflush(file); + return (0); } /* Print an ASCII representation of the overlap in align between fragments @@ -2773,7 +2949,7 @@ static int dandc_nd(char *A, int M, char *B, int N, Trace_Waves *wave) } -static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) +static int Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -2792,13 +2968,15 @@ static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) L = asub; L *= sizeof(int); if (L > work->tramax) - enlarge_trace(work,L); + if (enlarge_trace(work,L)) + EXIT(1); trace = wave.Stop = ((int *) work->trace); D = 2*(path->diffs + 4)*sizeof(int); if (D > work->vecmax) - enlarge_vector(work,D); + if (enlarge_vector(work,D)) + EXIT(1); D = (path->diffs+3)/2; wave.VF = ((int *) work->vector) + (D+1); @@ -2811,6 +2989,7 @@ static void Compute_Trace_ND_ALL(Alignment *align, Work_Data *ework) align->bseq+path->bbpos,path->bepos-path->bbpos,&wave); path->trace = trace; path->tlen = wave.Stop - trace; + return (0); } @@ -2840,7 +3019,8 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) { int *F0, *F1, *F2; int *HF; - int low, hgh, pos; + int low, hgh; + int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); @@ -2862,10 +3042,19 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) { low = del; hgh = 0; } + + posl = -INT32_MAX; + posh = INT32_MAX; if (wave->Aabs == wave->Babs) - pos = B-A; - else - pos = -INT32_MAX; + { if (B == A) + { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + EXIT(-1); + } + else if (B < A) + posl = (B-A)+1; + else + posh = (B-A)-1; + } F1 = PVF[-2]; F0 = PVF[-1]; @@ -2888,10 +3077,10 @@ static int iter_np(char *A, int M, char *B, int N, Trace_Waves *wave) HF = PHF[D]; if ((D & 0x1) == 0) - { hgh += 1; - low -= 1; - if (low <= pos) - low += 1; + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; } F0[hgh+1] = F0[low-1] = -2; @@ -3086,7 +3275,8 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) { int *F0, *F1, *F2; int *HF; - int low, hgh, pos; + int low, hgh; + int posl, posh; #ifdef DEBUG_ALIGN printf("\n%*s BASE %ld,%ld: %d vs %d\n",depth,"",A-wave->Aabs,B-wave->Babs,M,N); @@ -3108,10 +3298,19 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) { low = del; hgh = 0; } + + posl = -INT32_MAX; + posh = INT32_MAX; if (wave->Aabs == wave->Babs) - pos = B-A; - else - pos = -INT32_MAX; + { if (B == A) + { EPRINTF(EPLACE,"Error: self comparison starts on diagonal 0 (Compute_Trace)\n"); + EXIT(1); + } + else if (B < A) + posl = (B-A)+1; + else + posh = (B-A)-1; + } F1 = PVF[-2]; F0 = PVF[-1]; @@ -3134,10 +3333,10 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) HF = PHF[D]; if ((D & 0x1) == 0) - { hgh += 1; - low -= 1; - if (low <= pos) - low += 1; + { if (low > posl) + low -= 1; + if (hgh < posh) + hgh += 1; } F0[hgh+1] = F0[low-1] = -2; @@ -3246,7 +3445,7 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) wave->mida = (A-wave->Aabs) + k + PVF[D][k]; } - return (1); + return (0); } @@ -3256,13 +3455,13 @@ static int middle_np(char *A, int M, char *B, int N, Trace_Waves *wave) * * \****************************************************************************************/ -void Compute_Trace_ALL(Alignment *align, Work_Data *ework) +int Compute_Trace_ALL(Alignment *align, Work_Data *ework) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; Path *path; char *aseq, *bseq; - int M, N; + int M, N, D; path = align->path; aseq = align->aseq; @@ -3282,19 +3481,19 @@ void Compute_Trace_ALL(Alignment *align, Work_Data *ework) s = M; s *= sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); dmax = path->diffs - abs(M-N); s = (dmax+3)*2*((M+N+3)*sizeof(int) + sizeof(int *)); if (s > 256000000) - { Compute_Trace_ND_ALL(align,ework); - return; - } + return (Compute_Trace_ND_ALL(align,ework)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3312,12 +3511,17 @@ void Compute_Trace_ALL(Alignment *align, Work_Data *ework) wave.Aabs = aseq; wave.Babs = bseq; - path->diffs = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave); + D = iter_np(aseq+path->abpos,M,bseq+path->bbpos,N,&wave); + if (D < 0) + EXIT(1); + path->diffs = D; path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); + + return (0); } -void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) +int Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -3348,7 +3552,8 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) else s = M*sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); nmax = 0; dmax = 0; @@ -3366,7 +3571,8 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) s = (dmax+3)*2*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3384,7 +3590,7 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) wave.Aabs = aseq; wave.Babs = bseq; - { int i; + { int i, d; diffs = 0; ab = path->abpos; @@ -3394,21 +3600,29 @@ void Compute_Trace_PTS(Alignment *align, Work_Data *ework, int trace_spacing) for (i = 1; i < tlen; i += 2) { ae = ae + trace_spacing; be = bb + points[i]; - diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + if (d < 0) + EXIT(1); + diffs += d; ab = ae; bb = be; } ae = path->aepos; be = path->bepos; - diffs += iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + d = iter_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave); + if (d < 0) + EXIT(1); + diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; + + return (0); } -void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) +int Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) { _Work_Data *work = (_Work_Data *) ework; Trace_Waves wave; @@ -3439,7 +3653,8 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) else s = M*sizeof(int); if (s > work->tramax) - enlarge_trace(work,s); + if (enlarge_trace(work,s)) + EXIT(1); nmax = 0; dmax = 0; @@ -3457,7 +3672,8 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) s = (dmax+3)*4*((trace_spacing+nmax+3)*sizeof(int) + sizeof(int *)); if (s > work->vecmax) - enlarge_vector(work,s); + if (enlarge_vector(work,s)) + EXIT(1); wave.PVF = PVF = ((int **) (work->vector)) + 2; wave.PHF = PHF = PVF + (dmax+3); @@ -3475,7 +3691,7 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) wave.Aabs = aseq; wave.Babs = bseq; - { int i; + { int i, d; int as, bs; int af, bf; @@ -3488,28 +3704,42 @@ void Compute_Trace_MID(Alignment *align, Work_Data *ework, int trace_spacing) { ae = ae + trace_spacing; be = bb + points[i]; if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) - { af = wave.mida; - bf = wave.midb; - diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); - ab = ae; - bb = be; - as = af; - bs = bf; - } - } - ae = path->aepos; - be = path->bepos; - if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) - { af = wave.mida; + EXIT(1); + af = wave.mida; bf = wave.midb; - diffs += iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; + ab = ae; + bb = be; as = af; bs = bf; } - diffs += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave); + + ae = path->aepos; + be = path->bepos; + + if (middle_np(aseq+ab,ae-ab,bseq+bb,be-bb,&wave)) + EXIT(1); + af = wave.mida; + bf = wave.midb; + d = iter_np(aseq+as,af-as,bseq+bs,bf-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; + as = af; + bs = bf; + + d += iter_np(aseq+af,ae-as,bseq+bf,be-bs,&wave); + if (d < 0) + EXIT(1); + diffs += d; } path->trace = work->trace; path->tlen = wave.Stop - ((int *) path->trace); path->diffs = diffs; + + return (0); } diff --git a/align.h b/align.h index bc95c12..110cfcc 100644 --- a/align.h +++ b/align.h @@ -58,6 +58,20 @@ #define _A_MODULE +/*** INTERACTIVE vs BATCH version + + The defined constant INTERACTIVE (set in DB.h) determines whether an interactive or + batch version of the routines in this library are compiled. In batch mode, routines + print an error message and exit. In interactive mode, the routines place the error + message in EPLACE (also defined in DB.h) and return an error value, typically NULL + if the routine returns a pointer, and an unusual integer value if the routine returns + an integer. + Below when an error return is described, one should understand that this value is returned + only if the routine was compiled in INTERACTIVE mode. + +***/ + + /*** PATH ABSTRACTION: Coordinates are *between* characters where 0 is the tick just before the first char, @@ -157,8 +171,8 @@ void Complement_Seq(char *a, int n); storage that is more efficiently reused with each call, rather than being allocated anew with each call. Each *thread* can create a Work_Data object with New_Work_Data and this object holds and retains the working storage for routines of this module between calls - to the routines. Free_Work_Data frees a Work_Data object and all working storage - held by it. + to the routines. If enough memory for a Work_Data is not available then NULL is returned. + Free_Work_Data frees a Work_Data object and all working storage held by it. */ typedef void Work_Data; @@ -182,7 +196,8 @@ void Complement_Seq(char *a, int n); If an alignment cannot reach the boundary of the d.p. matrix with this condition (i.e. overlap), then the last/first 30 columns of the alignment are guaranteed to be suffix/prefix positive at correlation ave_corr * g(freq) where g is an empirically - measured function that increases from 1 as the entropy of freq decreases. + measured function that increases from 1 as the entropy of freq decreases. If memory is + unavailable or the freq distribution is too skewed then NULL is returned. You can get back the original parameters used to create an Align_Spec with the simple utility functions below. @@ -199,19 +214,27 @@ void Complement_Seq(char *a, int n); float *Base_Frequencies (Align_Spec *spec); /* Local_Alignment finds the longest significant local alignment between the sequences in - 'align' subject to the alignment criterion given by the Align_Spec 'spec' that passes - through one of the points '(xlow-xhgh,y)' within the underlying dynamic programming matrix. - The path record of 'align' has its 'trace' filled from the point of view of an overlap between - the aread and the bread. In addition a Path record from the point of view of the bread - versus the aread is returned by the function, with this Path's 'trace' filled in - appropriately. The space for the returned path and the two 'trace's are in the - working storage supplied by the Work_Data packet and this space is reused with each call, - so if one wants to retain the bread-path and the two trace point sequences, then they - must be copied to user-allocated storage before calling the routine again. + 'align' subject to: + + (a) the alignment criterion given by the Align_Spec 'spec', + (b) it passes through one of the points (anti+k)/2,(anti-k)/2 for k in [low,hgh] within + the underlying dynamic programming matrix (i.e. the points on diagonals low to hgh + on anti-diagonal anti or anti-1 (depending on whether the diagonal is odd or even)), + (c) if lbord >= 0, then the alignment is always above diagonal low-lbord, and + (d) if hbord >= 0, then the alignment is always below diagonal hgh+hbord. + + The path record of 'align' has its 'trace' filled from the point of view of an overlap + between the aread and the bread. In addition a Path record from the point of view of the + bread versus the aread is returned by the function, with this Path's 'trace' filled in + appropriately. The space for the returned path and the two 'trace's are in the working + storage supplied by the Work_Data packet and this space is reused with each call, so if + one wants to retain the bread-path and the two trace point sequences, then they must be + copied to user-allocated storage before calling the routine again. NULL is returned in + the event of an error. */ Path *Local_Alignment(Alignment *align, Work_Data *work, Align_Spec *spec, - int xlow, int xhgh, int y); + int low, int hgh, int anti, int lbord, int hbord); /* Given a legitimate Alignment object, Compute_Trace_X computes an exact trace for the alignment. If 'path.trace' is non-NULL, then it is assumed to be a sequence of pass-through points @@ -228,12 +251,13 @@ void Complement_Seq(char *a, int n); but at the tradeoff of not necessarily being optimal as pass-through points are not all perfect. Compute_Trace_MID computes a trace by computing the trace between the mid-points of alignments between two adjacent pairs of pass through points. It is generally twice as - slow as Compute_Trace_PTS, but it produces nearer optimal alignments. + slow as Compute_Trace_PTS, but it produces nearer optimal alignments. All these routines + return 1 if an error occurred and 0 otherwise. */ - void Compute_Trace_ALL(Alignment *align, Work_Data *work); - void Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing); - void Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing); + int Compute_Trace_ALL(Alignment *align, Work_Data *work); + int Compute_Trace_PTS(Alignment *align, Work_Data *work, int trace_spacing); + int Compute_Trace_MID(Alignment *align, Work_Data *work, int trace_spacing); /* Alignment_Cartoon prints an ASCII representation of the overlap relationhip between the two reads of 'align' to the given 'file' indented by 'indent' space. Coord controls @@ -253,6 +277,8 @@ void Complement_Seq(char *a, int n); in segments of different lengths, but is convenient when looking at two alignments involving A as segments are guaranteed to cover the same interval of A in a segment. + Both Print routines return 1 if an error occurred (not enough memory), and 0 otherwise. + Flip_Alignment modifies align so the roles of A and B are reversed. If full is off then the trace is ignored, otherwise the trace must be to a full alignment trace and this trace is also appropriately inverted. @@ -260,10 +286,10 @@ void Complement_Seq(char *a, int n); void Alignment_Cartoon(FILE *file, Alignment *align, int indent, int coord); - void Print_Alignment(FILE *file, Alignment *align, Work_Data *work, + int Print_Alignment(FILE *file, Alignment *align, Work_Data *work, int indent, int width, int border, int upper, int coord); - void Print_Reference(FILE *file, Alignment *align, Work_Data *work, + int Print_Reference(FILE *file, Alignment *align, Work_Data *work, int indent, int block, int border, int upper, int coord); void Flip_Alignment(Alignment *align, int full); diff --git a/filter.c b/filter.c index 3205049..db80348 100644 --- a/filter.c +++ b/filter.c @@ -67,13 +67,15 @@ // <= 4 ^ -(kmer-MAX_BIAS) #define MAXGRAM 10000 // Cap on k-mer count histogram (in count_thread, merge_thread) +#define PANEL_SIZE 50000 // Size to break up very long A-reads +#define PANEL_OVERLAP 10000 // Overlap of A-panels + #undef TEST_LSORT #undef TEST_KSORT #undef TEST_PAIRS #undef TEST_CSORT #define HOW_MANY 3000 // Print first HOW_MANY items for each of the TEST options above -#define TEST_HIT // When off, just tuple filtering alone, no check #undef TEST_GATHER #undef SHOW_OVERLAP // Show the cartoon #undef SHOW_ALIGNMENT // Show the alignment @@ -103,7 +105,7 @@ typedef struct } KmerPos; typedef struct - { int bpos; + { int diag; int apos; int aread; int bread; @@ -119,7 +121,7 @@ typedef struct typedef struct { int apos; - int bpos; + int diag; int bread; int aread; } SeedPair; @@ -1069,7 +1071,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = ar; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1086,7 +1088,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = ar; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1122,7 +1124,7 @@ static void *merge_thread(void *arg) { hits[nhits].bread = bsort[b].read; hits[nhits].aread = asort[a].read; hits[nhits].apos = ap; - hits[nhits].bpos = bsort[b].rpos; + hits[nhits].diag = ap - bsort[b].rpos; nhits += 1; } } @@ -1137,6 +1139,37 @@ static void *merge_thread(void *arg) } +void Diagonal_Span(Path *path, int tspace, int *mind, int *maxd) +{ uint16 *points; + int i, tlen; + int dd, low, hgh; + + points = (uint16 *) path->trace; + tlen = path->tlen; + + dd = path->bbpos - path->abpos; + low = hgh = dd; + + dd = path->bepos - path->aepos; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + + dd = path->bbpos - (path->abpos/tspace)*tspace; + tlen -= 2; + for (i = 1; i < tlen; i += 2) + { dd += points[i]-tspace; + if (dd < low) + low = dd; + else if (dd > hgh) + hgh = dd; + } + + *mind = (low >> Binshift)-1; + *maxd = (hgh >> Binshift)+1; +} + static HITS_DB *MR_ablock; static HITS_DB *MR_bblock; static SeedPair *MR_hits; @@ -1147,6 +1180,7 @@ typedef struct { int64 beg, end; int *score; int *lastp; + int *lasta; Work_Data *work; Align_Spec *aspec; FILE *ofile1; @@ -1164,22 +1198,23 @@ static void *report_thread(void *arg) HITS_READ *aread = MR_ablock->reads; HITS_READ *bread = MR_bblock->reads; int *score = data->score; + int *scorp = data->score + 1; + int *scorm = data->score - 1; int *lastp = data->lastp; -#ifdef TEST_HIT + int *lasta = data->lasta; Work_Data *work = data->work; -#endif Align_Spec *aspec = data->aspec; FILE *ofile1 = data->ofile1; FILE *ofile2 = data->ofile2; int afirst = MR_ablock->tfirst; int bfirst = MR_bblock->tfirst; + int maxdiag = ( MR_ablock->maxlen >> Binshift); + int mindiag = (-MR_bblock->maxlen >> Binshift); Overlap _ovla, *ovla = &_ovla; Overlap _ovlb, *ovlb = &_ovlb; Alignment _align, *align = &_align; -#ifdef TEST_HIT Path *apath; -#endif Path *bpath = &(ovlb->path); int64 nfilt = 0; int64 ahits = 0; @@ -1188,9 +1223,8 @@ static void *report_thread(void *arg) Double *hitc; int minhit; - - uint64 p, q; - int64 g, h, f, end; + uint64 cpair, npair; + int64 nidx, eidx; // In ovl and align roles of A and B are reversed, as the B sequence must be the // complemented sequence !! @@ -1217,60 +1251,77 @@ static void *report_thread(void *arg) minhit = (Hitmin-1)/Kmer + 1; hitc = hitd + (minhit-1); - end = data->end - minhit; - h = data->beg; - for (p = hitd[h].p2; h < end; p = q) - if (hitc[h].p2 != p) - { h += 1; - while ((q = hitd[h].p2) == p) - h += 1; + eidx = data->end - minhit; + nidx = data->beg; + for (cpair = hitd[nidx].p2; nidx < eidx; cpair = npair) + if (hitc[nidx].p2 != cpair) + { nidx += 1; + while ((npair = hitd[nidx].p2) == cpair) + nidx += 1; } else - { int apos, bpos, diag; - int lasta, lastd; - int ar, br; + { int ar, br; int alen, blen; + int setaln, amark, amark2; + int apos, bpos, diag; + int64 lidx, sidx; + int64 f, h2; - ar = hits[h].aread; - br = hits[h].bread; + ar = hits[nidx].aread; + br = hits[nidx].bread; alen = aread[ar].rlen; blen = bread[br].rlen; if (alen < HGAP_MIN && blen < HGAP_MIN) - { do - q = hitd[++h].p2; - while (q == p); + { nidx += 1; + while ((npair = hitd[nidx].p2) == cpair) + nidx += 1; continue; } - - g = h; - do - { bpos = hits[h].bpos; - apos = hits[h].apos; - diag = (apos - bpos) >> Binshift; - if (apos - lastp[diag] >= Kmer) - score[diag] += Kmer; - else - score[diag] += apos - lastp[diag]; - lastp[diag] = apos; - q = hitd[++h].p2; - } - while (q == p); #ifdef TEST_GATHER - printf("%5d vs %5d : %3lld",br+bfirst,ar+afirst,h-g); + printf("%5d vs %5d : %5d x %5d\n",br+bfirst,ar+afirst,blen,alen); +#endif + setaln = 1; + amark2 = 0; + for (sidx = nidx; hitd[nidx].p2 == cpair; nidx = h2) + { amark = amark2 + PANEL_SIZE; + amark2 = amark - PANEL_OVERLAP; + + h2 = lidx = nidx; + do + { apos = hits[nidx].apos; + npair = hitd[++nidx].p2; + if (apos <= amark2) + h2 = nidx; + } + while (npair == cpair && apos <= amark); + + if (nidx-lidx < minhit) continue; + + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag >> Binshift; + if (apos - lastp[diag] >= Kmer) + score[diag] += Kmer; + else + score[diag] += apos - lastp[diag]; + lastp[diag] = apos; + } + +#ifdef TEST_GATHER + printf(" %6lld upto %6d",nidx-lidx,amark); #endif - lasta = -1; - lastd = -(Kmer+1); - for (f = g; f < h; f++) - { bpos = hits[f].bpos; - apos = hits[f].apos; - diag = apos - bpos; - if ((lastd != diag && apos >= lasta) || (lastd == diag && apos > lasta+Kmer)) - { diag >>= Binshift; - if (score[diag] + score[diag+1] >= Hitmin || score[diag] + score[diag-1] >= Hitmin) - { if (lasta < 0) - { align->bseq = aseq + aread[ar].boff; + for (f = lidx; f < nidx; f++) + { apos = hits[f].apos; + diag = hits[f].diag; + bpos = apos - diag; + diag = diag >> Binshift; + if (apos > lasta[diag] && + (score[diag] + scorp[diag] >= Hitmin || score[diag] + scorm[diag] >= Hitmin)) + { if (setaln) + { setaln = 0; + align->bseq = aseq + aread[ar].boff; align->aseq = bseq + bread[br].boff; align->blen = alen; align->alen = blen; @@ -1280,21 +1331,35 @@ static void *report_thread(void *arg) #ifdef TEST_GATHER else printf("\n "); - if (score[diag-1] > score[diag+1]) + + if (scorm[diag] > scorp[diag]) printf(" %5d.. x %5d.. %5d (%3d)", - bpos,apos,apos-bpos,score[diag]+score[diag-1]); + bpos,apos,apos-bpos,score[diag]+scorm[diag]); else printf(" %5d.. x %5d.. %5d (%3d)", - bpos,apos,apos-bpos,score[diag]+score[diag+1]); + bpos,apos,apos-bpos,score[diag]+scorp[diag]); #endif nfilt += 1; -#ifdef TEST_HIT + apath = Local_Alignment(align,work,aspec,bpos-apos,bpos-apos,bpos+apos,-1,-1); - apath = Local_Alignment(align,work,aspec,bpos,bpos,apos); - lasta = bpath->bepos; - lastd = lasta - bpath->aepos; - if ((apath->aepos - apath->abpos) + (apath->bepos - apath->bbpos) >= MINOVER) + { int low, hgh, be; + + Diagonal_Span(bpath,tspace,&low,&hgh); + if (diag < low) + low = diag; + else if (diag > hgh) + hgh = diag; + be = bpath->bepos; + for (diag = low; diag <= hgh; diag++) + if (be > lasta[diag]) + lasta[diag] = be; +#ifdef TEST_GATHER + printf(" %d - %d @ %d",low,hgh,bpath->bepos); +#endif + } + + if ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos) >= MINOVER) { ovla->path = *apath; if (small) { Compress_TraceTo8(ovla); @@ -1331,26 +1396,34 @@ static void *report_thread(void *arg) #ifdef TEST_GATHER else printf(" No alignment %d", - ((apath->aepos - apath->abpos) + (apath->bepos - apath->bbpos))/2); + ((apath->aepos-apath->abpos) + (apath->bepos-apath->bbpos))/2); #endif - -#else // TEST_HIT - - break; - -#endif // TEST_HIT } } - } + + for (f = lidx; f < nidx; f++) + { diag = hits[f].diag >> Binshift; + score[diag] = lastp[diag] = 0; + } #ifdef TEST_GATHER - printf("\n"); + printf("\n"); #endif + } + + for (f = sidx; f < nidx; f++) + { int d; - for (; g < h; g++) - { bpos = hits[g].bpos; - apos = hits[g].apos; - diag = (apos - bpos) >> Binshift; - score[diag] = lastp[diag] = 0; + diag = hits[f].diag >> Binshift; + for (d = diag; d <= maxdiag; d++) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; + for (d = diag-1; d >= mindiag; d--) + if (lasta[d] == 0) + break; + else + lasta[d] = 0; } } @@ -1448,9 +1521,9 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, if (VERBOSE) { if (comp) - printf("\nComparing %s to %s\n",aname,bname); - else printf("\nComparing c(%s) to %s\n",aname,bname); + else + printf("\nComparing %s to %s\n",aname,bname); } if (alen == 0 || blen == 0) @@ -1625,7 +1698,7 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, khit[nhits].aread = 0x7fffffff; khit[nhits].bread = 0x7fffffff; - khit[nhits].bpos = 0x7fffffff; + khit[nhits].diag = 0x7fffffff; khit[nhits].apos = 0; #ifdef TEST_CSORT @@ -1660,19 +1733,20 @@ void Match_Filter(char *aname, HITS_DB *ablock, char *bname, HITS_DB *bblock, } parmr[NTHREADS-1].end = nhits; - w = ((ablock->maxlen >> Binshift) - (-bblock->maxlen >> Binshift)) + 1; - counters = (int *) Malloc(NTHREADS*4*w*sizeof(int),"Allocating diagonal buckets"); + w = ((ablock->maxlen >> Binshift) - ((-bblock->maxlen) >> Binshift)) + 1; + counters = (int *) Malloc(NTHREADS*3*w*sizeof(int),"Allocating diagonal buckets"); if (counters == NULL) exit (1); - for (i = 0; i < 4*w*NTHREADS; i++) + for (i = 0; i < 3*w*NTHREADS; i++) counters[i] = 0; for (i = 0; i < NTHREADS; i++) { if (i == 0) parmr[i].score = counters - ((-bblock->maxlen) >> Binshift); else - parmr[i].score = parmr[i-1].lastp + w; + parmr[i].score = parmr[i-1].lasta + w; parmr[i].lastp = parmr[i].score + w; + parmr[i].lasta = parmr[i].lastp + w; parmr[i].work = New_Work_Data(); parmr[i].aspec = aspec;